From 1fa6c257a372c5912aff0d4cf4439858a4bf86b8 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Mon, 3 Jul 2023 20:40:00 -0700 Subject: [PATCH 01/32] add double write env Signed-off-by: Connor1996 --- Cargo.toml | 20 +- src/env/double_write.rs | 403 ++++++++++++++++++++++++++++++++++++++++ src/env/mod.rs | 1 + 3 files changed, 411 insertions(+), 13 deletions(-) create mode 100644 src/env/double_write.rs diff --git a/Cargo.toml b/Cargo.toml index ea8107df..3e9af25c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,8 +35,11 @@ required-features = ["failpoints"] byteorder = "1.2" crc32fast = "1.2" crossbeam = "0.8" +crossbeam-channel = "0.5.8" +either = "1.8.1" fail = "0.5" fs2 = "0.4" +futures = "0.3.28" hashbrown = "0.14" hex = "0.4" if_chain = "1.0" @@ -73,19 +76,10 @@ toml = "0.7" [features] internals = [] -nightly = [ - "prometheus/nightly", -] -failpoints = [ - "fail/failpoints", -] -scripting = [ - "rhai", -] -swap = [ - "nightly", - "memmap2", -] +nightly = ["prometheus/nightly"] +failpoints = ["fail/failpoints"] +scripting = ["rhai"] +swap = ["nightly", "memmap2"] # Shortcuts all_except_failpoints = ["internals", "scripting", "nightly", "swap"] diff --git a/src/env/double_write.rs b/src/env/double_write.rs new file mode 100644 index 00000000..90f3c67e --- /dev/null +++ b/src/env/double_write.rs @@ -0,0 +1,403 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use crossbeam::channel::unbounded; +use crossbeam::channel::Sender; +use log::{warn, Log}; +use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; +use std::thread; + +use crate::env::default::LogFd; +use crate::env::DefaultFileSystem; +use crate::env::{FileSystem, Handle, Permission, WriteExt}; +use futures::channel::oneshot; +use futures::executor::block_on; +use futures::select; + +use either::Either; + +type Callback = Box>) + Send>; + +#[derive(PartialEq)] +enum Task { + Create(PathBuf), + Open { + path: PathBuf, + perm: Permission, + }, + Delete(PathBuf), + Rename { + src_path: PathBuf, + dst_path: PathBuf, + }, + Stop, +} + +pub struct DoubleWriteFileSystem { + path1: PathBuf, + path2: PathBuf, + disk1: Sender<(Task, Callback)>, + disk2: Sender<(Task, Callback)>, + + handle1: Option>, + handle2: Option>, +} + +impl DoubleWriteFileSystem { + fn new(path1: PathBuf, path2: PathBuf) -> Self { + let (tx1, rx1) = unbounded::<(Task, Callback)>(); + let (tx2, rx2) = unbounded::<(Task, Callback)>(); + let handle1 = thread::spawn(|| { + let fs = DefaultFileSystem {}; + for (task, cb) in rx1 { + if task == Task::Stop { + break; + } + let res = Self::handle(&fs, task); + cb(res); + } + }); + let handle2 = thread::spawn(|| { + let fs = DefaultFileSystem {}; + for (task, cb) in rx2 { + if task == Task::Stop { + break; + } + let res = Self::handle(&fs, task); + cb(res); + } + }); + Self { + path1, + path2, + disk1: tx1, + disk2: tx2, + handle1: Some(handle1), + handle2: Some(handle2), + } + } + + async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { + let (cb1, mut f1) = paired_future_callback(); + let (cb2, mut f2) = paired_future_callback(); + self.disk1.send((task1, cb1)); + self.disk2.send((task2, cb2)); + + select! 
{ + res1 = f1 => res1.unwrap().map(|h| DoubleWriteHandle::new( + Either::Right(h.unwrap()), Either::Left(f2) )), + res2 = f2 => res2.unwrap().map(|h| DoubleWriteHandle::new( + Either::Left(f1), Either::Right(h.unwrap()) )), + } + } + + async fn wait_one(&self, task1: Task, task2: Task) -> IoResult<()> { + let (cb1, mut f1) = paired_future_callback(); + let (cb2, mut f2) = paired_future_callback(); + self.disk1.send((task1, cb1)); + self.disk2.send((task2, cb2)); + + select! { + res1 = f1 => res1.unwrap().map(|_| ()), + res2 = f2 => res2.unwrap().map(|_| ()), + } + } + + fn replace_path(&self, path: &Path) -> PathBuf { + if let Ok(file) = path.strip_prefix(&self.path1) { + self.path2.clone().join(file) + } else { + panic!("Invalid path: {:?}", path); + } + } + + #[inline] + fn handle(file_system: &DefaultFileSystem, task: Task) -> IoResult> { + match task { + Task::Create(path) => file_system.create(path).map(|h| Some(h)), + Task::Open { path, perm } => file_system.open(path, perm).map(|h| Some(h)), + Task::Delete(path) => file_system.delete(path).map(|_| None), + Task::Rename { src_path, dst_path } => { + file_system.rename(src_path, dst_path).map(|_| None) + } + Task::Stop => unreachable!(), + } + } +} + +impl Drop for DoubleWriteFileSystem { + fn drop(&mut self) { + self.disk1.send((Task::Stop, Box::new(|_| {}))).unwrap(); + self.disk2.send((Task::Stop, Box::new(|_| {}))).unwrap(); + self.handle1.take().unwrap().join().unwrap(); + self.handle2.take().unwrap().join().unwrap(); + } +} + +impl FileSystem for DoubleWriteFileSystem { + type Handle = DoubleWriteHandle; + type Reader = DoubleWriteReader; + type Writer = DoubleWriteWriter; + + fn create>(&self, path: P) -> IoResult { + block_on(self.wait_handle( + Task::Create(path.as_ref().to_path_buf()), + Task::Create(self.replace_path(path.as_ref())), + )) + } + + fn open>(&self, path: P, perm: Permission) -> IoResult { + block_on(self.wait_handle( + Task::Open { + path: path.as_ref().to_path_buf(), + perm, + }, + Task::Open { + path: self.replace_path(path.as_ref()), + perm, + }, + )) + } + + fn delete>(&self, path: P) -> IoResult<()> { + block_on(self.wait_one( + Task::Delete(path.as_ref().to_path_buf()), + Task::Delete(self.replace_path(path.as_ref())), + )) + } + + fn rename>(&self, src_path: P, dst_path: P) -> IoResult<()> { + block_on(self.wait_one( + Task::Rename { + src_path: src_path.as_ref().to_path_buf(), + dst_path: dst_path.as_ref().to_path_buf(), + }, + Task::Rename { + src_path: self.replace_path(src_path.as_ref()), + dst_path: self.replace_path(dst_path.as_ref()), + }, + )) + } + + fn new_reader(&self, handle: Arc) -> IoResult { + Ok(DoubleWriteReader::new(handle)) + } + + fn new_writer(&self, handle: Arc) -> IoResult { + Ok(DoubleWriteWriter::new(handle)) + } +} + +#[derive(Clone, PartialEq)] +enum FileTask { + Truncate(usize), + FileSize, + Sync, + Write { offset: usize, bytes: Vec }, + Allocate { offset: usize, size: usize }, + Stop, +} + +pub struct DoubleWriteHandle { + disk1: Sender<(FileTask, Callback)>, + disk2: Sender<(FileTask, Callback)>, + + handle1: Option>, + handle2: Option>, +} + +impl DoubleWriteHandle { + pub fn new( + file1: Either>>, LogFd>, + file2: Either>>, LogFd>, + ) -> Self { + let (tx1, rx1) = unbounded::<(FileTask, Callback)>(); + let (tx2, rx2) = unbounded::<(FileTask, Callback)>(); + let handle1 = thread::spawn(|| { + let fd = Self::resolve(file1); + for (task, cb) in rx1 { + if task == FileTask::Stop { + break; + } + let res = Self::handle(&fd, task); + cb(res); + } + }); + let handle2 = 
thread::spawn(|| { + let fd = Self::resolve(file2); + for (task, cb) in rx2 { + if task == FileTask::Stop { + break; + } + let res = Self::handle(&fd, task); + cb(res); + } + }); + Self { + disk1: tx1, + disk2: tx2, + handle1: Some(handle1), + handle2: Some(handle2), + } + } + + fn resolve(file: Either>>, LogFd>) -> LogFd { + match file { + Either::Left(f) => { + // TODO: should we handle the second disk io error + block_on(f).unwrap().unwrap().unwrap() + } + Either::Right(fd) => fd, + } + } + + fn handle(fd: &LogFd, task: FileTask) -> IoResult> { + match task { + FileTask::Truncate(offset) => fd.truncate(offset).map(|_| None), + FileTask::FileSize => fd.file_size().map(|x| Some(x)), + FileTask::Sync => fd.sync().map(|_| None), + FileTask::Write { offset, bytes } => fd.write(offset, &bytes).map(|x| Some(x)), + FileTask::Allocate { offset, size } => fd.allocate(offset, size).map(|_| None), + FileTask::Stop => unreachable!(), + } + } + + fn read(&self, offset: usize, buf: &mut [u8]) -> IoResult { + // TODO: + unimplemented!() + } + + fn write(&self, offset: usize, content: &[u8]) -> IoResult { + block_on(self.wait_one(FileTask::Write { + offset, + bytes: content.to_vec(), + })) + .map(|x| x.unwrap()) + } + + fn allocate(&self, offset: usize, size: usize) -> IoResult<()> { + block_on(self.wait_one(FileTask::Allocate { offset, size })).map(|_| ()) + } + + async fn wait_one(&self, task: FileTask) -> IoResult> { + let (cb1, mut f1) = paired_future_callback(); + let (cb2, mut f2) = paired_future_callback(); + self.disk1.send((task.clone(), cb1)).unwrap(); + self.disk2.send((task, cb2)).unwrap(); + + select! { + res1 = f1 => res1.unwrap(), + res2 = f2 => res2.unwrap(), + } + } +} + +impl Handle for DoubleWriteHandle { + fn truncate(&self, offset: usize) -> IoResult<()> { + block_on(self.wait_one(FileTask::Truncate(offset))).map(|_| ()) + } + + fn file_size(&self) -> IoResult { + block_on(self.wait_one(FileTask::FileSize)).map(|x| x.unwrap()) + } + + fn sync(&self) -> IoResult<()> { + block_on(self.wait_one(FileTask::Sync)).map(|_| ()) + } +} + +pub struct DoubleWriteWriter { + inner: Arc, + offset: usize, +} + +impl DoubleWriteWriter { + pub fn new(handle: Arc) -> Self { + Self { + inner: handle, + offset: 0, + } + } +} + +impl Write for DoubleWriteWriter { + fn write(&mut self, buf: &[u8]) -> IoResult { + let len = self.inner.write(self.offset, buf)?; + self.offset += len; + Ok(len) + } + + fn flush(&mut self) -> IoResult<()> { + Ok(()) + } +} + +impl WriteExt for DoubleWriteWriter { + fn truncate(&mut self, offset: usize) -> IoResult<()> { + self.inner.truncate(offset)?; + self.offset = offset; + Ok(()) + } + + fn allocate(&mut self, offset: usize, size: usize) -> IoResult<()> { + self.inner.allocate(offset, size) + } +} + +impl Seek for DoubleWriteWriter { + fn seek(&mut self, pos: SeekFrom) -> IoResult { + match pos { + SeekFrom::Start(offset) => self.offset = offset as usize, + SeekFrom::Current(i) => self.offset = (self.offset as i64 + i) as usize, + SeekFrom::End(i) => self.offset = (self.inner.file_size()? 
as i64 + i) as usize, + } + Ok(self.offset as u64) + } +} + +pub struct DoubleWriteReader { + inner: Arc, + offset: usize, +} + +impl DoubleWriteReader { + pub fn new(handle: Arc) -> Self { + Self { + inner: handle, + offset: 0, + } + } +} + +impl Seek for DoubleWriteReader { + fn seek(&mut self, pos: SeekFrom) -> IoResult { + match pos { + SeekFrom::Start(offset) => self.offset = offset as usize, + SeekFrom::Current(i) => self.offset = (self.offset as i64 + i) as usize, + SeekFrom::End(i) => self.offset = (self.inner.file_size()? as i64 + i) as usize, + } + Ok(self.offset as u64) + } +} + +impl Read for DoubleWriteReader { + fn read(&mut self, buf: &mut [u8]) -> IoResult { + let len = self.inner.read_impl(self.offset, buf)?; + self.offset += len; + Ok(len) + } +} + +pub fn paired_future_callback( +) -> (Callback, oneshot::Receiver>>) { + let (tx, future) = oneshot::channel(); + let callback = Box::new(move |result| { + let r = tx.send(result); + if r.is_err() { + warn!("paired_future_callback: Failed to send result to the future rx, discarded."); + } + }); + (callback, future) +} diff --git a/src/env/mod.rs b/src/env/mod.rs index 3e24d2be..be981730 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -5,6 +5,7 @@ use std::path::Path; use std::sync::Arc; mod default; +mod double_write; mod obfuscated; pub use default::DefaultFileSystem; From 4d22a399e931c30d2b976991dad15a2cce4f4f30 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 6 Jul 2023 10:43:34 -0700 Subject: [PATCH 02/32] add test Signed-off-by: Connor1996 --- src/engine.rs | 110 ++++++++++++++++++++++++ src/env/double_write.rs | 183 ++++++++++++++++++++++++++++++++++------ src/env/mod.rs | 1 + 3 files changed, 267 insertions(+), 27 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 42e55821..ce4587ff 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -11,6 +11,7 @@ use log::{error, info}; use protobuf::{parse_from_bytes, Message}; use crate::config::{Config, RecoveryMode}; +use crate::env::DoubleWriteFileSystem; use crate::consistency::ConsistencyChecker; use crate::env::{DefaultFileSystem, FileSystem}; use crate::event_listener::EventListener; @@ -2597,6 +2598,115 @@ pub(crate) mod tests { assert!(engine.raft_groups().is_empty()); } + #[test] + fn test_start_engine_with_second_disk() { + let dir = tempfile::Builder::new() + .prefix("test_start_engine_with_multi_dirs_default") + .tempdir() + .unwrap(); + let sec_dir = tempfile::Builder::new() + .prefix("test_start_engine_with_multi_dirs_second") + .tempdir() + .unwrap(); + fn number_of_files(p: &Path) -> usize { + let mut r = 0; + std::fs::read_dir(p).unwrap().for_each(|e| { + if e.unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("000") + { + r += 1; + } + }); + r + } + let file_system = Arc::new(DoubleWriteFileSystem::new(dir.path().to_path_buf(), sec_dir.path().to_path_buf())); + let entry_data = vec![b'x'; 512]; + + // Preparations for multi-dirs. + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + enable_log_recycle: false, + target_file_size: ReadableSize(1), + ..Default::default() + }; + + // Step 1: write data into the main directory. + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + for rid in 1..=10 { + engine.append(rid, 1, 10, Some(&entry_data)); + } + drop(engine); + + // Restart the engine with recycle and prefill. Test reusing files from both + // dirs. 
+ let cfg_2 = Config { + enable_log_recycle: true, + prefill_for_recycle: true, + purge_threshold: ReadableSize(40), + ..cfg.clone() + }; + let engine = RaftLogEngine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + for rid in 1..=10 { + assert_eq!(engine.first_index(rid).unwrap(), 1); + engine.clean(rid); + } + engine.purge_manager.must_rewrite_append_queue(None, None); + let file_count = number_of_files(dir.path()); + assert_eq!(number_of_files(sec_dir.path()), file_count); + assert!(file_count > engine.file_count(None)); + // Append data, recycled files are reused. + for rid in 1..=30 { + engine.append(rid, 20, 30, Some(&entry_data)); + } + // No new file is created. + let file_count1 = number_of_files(dir.path()); + assert_eq!(file_count, file_count1); + assert_eq!(number_of_files(sec_dir.path()), file_count1); + + // let cfg_3 = Config { + // enable_log_recycle: false, + // purge_threshold: ReadableSize(40), + // ..cfg + // }; + // drop(engine); + // let engine = RaftLogEngine::open_with_file_system(cfg_3, file_system).unwrap(); + // assert!(number_of_files(spill_dir.path()) > 0); + // for rid in 1..=10 { + // assert_eq!(engine.first_index(rid).unwrap(), 20); + // } + + // // abnormal case - duplicate FileSeq among different dirs. + // { + // // Prerequisite: choose several files and duplicate them to main dir. + // let mut file_count = 0; + // for e in std::fs::read_dir(spill_dir.path()).unwrap() { + // let p = e.unwrap().path(); + // let file_name = p.file_name().unwrap().to_str().unwrap(); + // if let Some(FileId { + // queue: LogQueue::Append, + // seq: _, + // }) = FileId::parse_file_name(file_name) + // { + // if file_count % 2 == 0 { + // std::fs::copy(&p, dir.path().join(file_name)).unwrap(); + // } + // file_count += 1; + // } + // } + // } + // let start = engine.file_span(LogQueue::Append).0; + // let engine = engine.reopen(); + // // Duplicate log files will be skipped and cleared. + // assert!(engine.file_span(LogQueue::Append).0 > start); + } + #[test] fn test_start_engine_with_multi_dirs() { let dir = tempfile::Builder::new() diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 90f3c67e..63632994 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -1,12 +1,21 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. +use crate::file_pipe_log::FileNameExt; +use crate::internals::parse_reserved_file_name; +use crate::internals::FileId; +use crate::internals::FileSeq; +use crate::internals::LogQueue; use crossbeam::channel::unbounded; use crossbeam::channel::Sender; -use log::{warn, Log}; +use log::{info, warn}; +use std::fs; use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; use std::path::Path; use std::path::PathBuf; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; use std::sync::Arc; +use std::sync::RwLock; use std::thread; use crate::env::default::LogFd; @@ -46,7 +55,11 @@ pub struct DoubleWriteFileSystem { } impl DoubleWriteFileSystem { - fn new(path1: PathBuf, path2: PathBuf) -> Self { + pub fn new(path1: PathBuf, path2: PathBuf) -> Self { + // TODO: check sync + // assume they are synced now. 
+ Self::check_sync(&path1, &path2); + let (tx1, rx1) = unbounded::<(Task, Callback)>(); let (tx2, rx2) = unbounded::<(Task, Callback)>(); let handle1 = thread::spawn(|| { @@ -79,11 +92,88 @@ impl DoubleWriteFileSystem { } } + fn check_sync(path1: &PathBuf, path2: &PathBuf) { + let check = |path: &PathBuf| -> (Vec, Vec, Vec) { + let mut append_file_names = vec![]; + let mut rewrite_file_names = vec![]; + let mut recycled_file_names = vec![]; + if !path.exists() { + info!("Create raft log directory: {}", path.display()); + fs::create_dir(path).unwrap(); + } + + fs::read_dir(path) + .unwrap() + .try_for_each(|e| -> IoResult<()> { + let dir_entry = e?; + let p = dir_entry.path(); + if !p.is_file() { + return Ok(()); + } + let file_name = p.file_name().unwrap().to_str().unwrap(); + match FileId::parse_file_name(file_name) { + Some(FileId { + queue: LogQueue::Append, + seq, + }) => append_file_names.push(seq), + Some(FileId { + queue: LogQueue::Rewrite, + seq, + }) => rewrite_file_names.push(seq), + _ => { + if let Some(seq) = parse_reserved_file_name(file_name) { + recycled_file_names.push(seq); + } + } + } + Ok(()) + }) + .unwrap(); + append_file_names.sort(); + rewrite_file_names.sort(); + recycled_file_names.sort(); + (append_file_names, rewrite_file_names, recycled_file_names) + }; + + let (append1, rewrite1, recycle1) = check(path1); + let (append2, rewrite2, recycle2) = check(path2); + if append1.first() != append2.first() { + panic!("Append file seq not match: {:?} vs {:?}", append1, append2); + } + if append1.last() != append2.last() { + panic!("Append file seq not match: {:?} vs {:?}", append1, append2); + } + if rewrite1.first() != rewrite2.first() { + panic!( + "Rewrite file seq not match: {:?} vs {:?}", + rewrite1, rewrite2 + ); + } + if rewrite1.last() != rewrite2.last() { + panic!( + "Rewrite file seq not match: {:?} vs {:?}", + rewrite1, rewrite2 + ); + } + if recycle1.first() != recycle2.first() { + panic!( + "Recycle file seq not match: {:?} vs {:?}", + recycle1, recycle2 + ); + } + if recycle1.last() != recycle2.last() { + panic!( + "Recycle file seq not match: {:?} vs {:?}", + recycle1, recycle2 + ); + } + } + async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.disk1.send((task1, cb1)); - self.disk2.send((task2, cb2)); + self.disk1.send((task1, cb1)).unwrap(); + self.disk2.send((task2, cb2)).unwrap(); select! { res1 = f1 => res1.unwrap().map(|h| DoubleWriteHandle::new( @@ -96,8 +186,8 @@ impl DoubleWriteFileSystem { async fn wait_one(&self, task1: Task, task2: Task) -> IoResult<()> { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.disk1.send((task1, cb1)); - self.disk2.send((task2, cb2)); + self.disk1.send((task1, cb1)).unwrap(); + self.disk2.send((task2, cb2)).unwrap(); select! 
{ res1 = f1 => res1.unwrap().map(|_| ()), @@ -203,6 +293,10 @@ enum FileTask { pub struct DoubleWriteHandle { disk1: Sender<(FileTask, Callback)>, disk2: Sender<(FileTask, Callback)>, + counter1: Arc, + counter2: Arc, + fd1: Arc>>>, + fd2: Arc>>>, handle1: Option>, handle2: Option>, @@ -215,29 +309,50 @@ impl DoubleWriteHandle { ) -> Self { let (tx1, rx1) = unbounded::<(FileTask, Callback)>(); let (tx2, rx2) = unbounded::<(FileTask, Callback)>(); - let handle1 = thread::spawn(|| { - let fd = Self::resolve(file1); - for (task, cb) in rx1 { - if task == FileTask::Stop { - break; + let counter1 = Arc::new(AtomicU64::new(0)); + let counter2 = Arc::new(AtomicU64::new(0)); + let fd1 = Arc::new(RwLock::new(None)); + let fd2 = Arc::new(RwLock::new(None)); + + let handle1 = { + let fd1 = fd1.clone(); + let counter1 = counter1.clone(); + thread::spawn(move || { + let fd = Arc::new(Self::resolve(file1)); + fd1.write().unwrap().replace(fd.clone()); + for (task, cb) in rx1 { + if task == FileTask::Stop { + break; + } + let res = Self::handle(&fd, task); + counter1.fetch_add(1, Ordering::Relaxed); + cb(res); } - let res = Self::handle(&fd, task); - cb(res); - } - }); - let handle2 = thread::spawn(|| { - let fd = Self::resolve(file2); - for (task, cb) in rx2 { - if task == FileTask::Stop { - break; + }) + }; + let handle2 = { + let fd2 = fd2.clone(); + let counter2 = counter2.clone(); + thread::spawn(move || { + let fd = Arc::new(Self::resolve(file2)); + fd2.write().unwrap().replace(fd.clone()); + for (task, cb) in rx2 { + if task == FileTask::Stop { + break; + } + let res = Self::handle(&fd, task); + counter2.fetch_add(1, Ordering::Relaxed); + cb(res); } - let res = Self::handle(&fd, task); - cb(res); - } - }); + }) + }; Self { disk1: tx1, disk2: tx2, + counter1, + counter2, + fd1, + fd2, handle1: Some(handle1), handle2: Some(handle2), } @@ -265,8 +380,13 @@ impl DoubleWriteHandle { } fn read(&self, offset: usize, buf: &mut [u8]) -> IoResult { - // TODO: - unimplemented!() + // TODO: read simultaneously from both disks + // choose latest to perform read + if self.counter1.load(Ordering::Relaxed) >= self.counter2.load(Ordering::Relaxed) { + self.fd1.read().unwrap().as_ref().unwrap().read(offset, buf) + } else { + self.fd2.read().unwrap().as_ref().unwrap().read(offset, buf) + } } fn write(&self, offset: usize, content: &[u8]) -> IoResult { @@ -294,6 +414,15 @@ impl DoubleWriteHandle { } } +impl Drop for DoubleWriteHandle { + fn drop(&mut self) { + self.disk1.send((FileTask::Stop, Box::new(|_| {}))).unwrap(); + self.disk2.send((FileTask::Stop, Box::new(|_| {}))).unwrap(); + self.handle1.take().unwrap().join().unwrap(); + self.handle2.take().unwrap().join().unwrap(); + } +} + impl Handle for DoubleWriteHandle { fn truncate(&self, offset: usize) -> IoResult<()> { block_on(self.wait_one(FileTask::Truncate(offset))).map(|_| ()) @@ -384,7 +513,7 @@ impl Seek for DoubleWriteReader { impl Read for DoubleWriteReader { fn read(&mut self, buf: &mut [u8]) -> IoResult { - let len = self.inner.read_impl(self.offset, buf)?; + let len = self.inner.read(self.offset, buf)?; self.offset += len; Ok(len) } diff --git a/src/env/mod.rs b/src/env/mod.rs index be981730..3d47e095 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -10,6 +10,7 @@ mod obfuscated; pub use default::DefaultFileSystem; pub use obfuscated::ObfuscatedFileSystem; +pub use double_write::DoubleWriteFileSystem; #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum Permission { From 6297cff87ee62087bb2617b0b190401589c91cac Mon Sep 17 00:00:00 2001 From: 
Connor1996 Date: Thu, 6 Jul 2023 14:41:39 -0700 Subject: [PATCH 03/32] add failpoint Signed-off-by: Connor1996 --- src/engine.rs | 44 ++------------ src/env/default.rs | 1 + src/env/double_write.rs | 16 ++++- src/env/mod.rs | 2 +- tests/failpoints/test_engine.rs | 101 ++++++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 42 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index ce4587ff..be82c6da 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -11,7 +11,6 @@ use log::{error, info}; use protobuf::{parse_from_bytes, Message}; use crate::config::{Config, RecoveryMode}; -use crate::env::DoubleWriteFileSystem; use crate::consistency::ConsistencyChecker; use crate::env::{DefaultFileSystem, FileSystem}; use crate::event_listener::EventListener; @@ -2624,7 +2623,10 @@ pub(crate) mod tests { }); r } - let file_system = Arc::new(DoubleWriteFileSystem::new(dir.path().to_path_buf(), sec_dir.path().to_path_buf())); + let file_system = Arc::new(DoubleWriteFileSystem::new( + dir.path().to_path_buf(), + sec_dir.path().to_path_buf(), + )); let entry_data = vec![b'x'; 512]; // Preparations for multi-dirs. @@ -2658,7 +2660,7 @@ pub(crate) mod tests { engine.clean(rid); } engine.purge_manager.must_rewrite_append_queue(None, None); - let file_count = number_of_files(dir.path()); + let file_count = number_of_files(dir.path()); assert_eq!(number_of_files(sec_dir.path()), file_count); assert!(file_count > engine.file_count(None)); // Append data, recycled files are reused. @@ -2669,42 +2671,6 @@ pub(crate) mod tests { let file_count1 = number_of_files(dir.path()); assert_eq!(file_count, file_count1); assert_eq!(number_of_files(sec_dir.path()), file_count1); - - // let cfg_3 = Config { - // enable_log_recycle: false, - // purge_threshold: ReadableSize(40), - // ..cfg - // }; - // drop(engine); - // let engine = RaftLogEngine::open_with_file_system(cfg_3, file_system).unwrap(); - // assert!(number_of_files(spill_dir.path()) > 0); - // for rid in 1..=10 { - // assert_eq!(engine.first_index(rid).unwrap(), 20); - // } - - // // abnormal case - duplicate FileSeq among different dirs. - // { - // // Prerequisite: choose several files and duplicate them to main dir. - // let mut file_count = 0; - // for e in std::fs::read_dir(spill_dir.path()).unwrap() { - // let p = e.unwrap().path(); - // let file_name = p.file_name().unwrap().to_str().unwrap(); - // if let Some(FileId { - // queue: LogQueue::Append, - // seq: _, - // }) = FileId::parse_file_name(file_name) - // { - // if file_count % 2 == 0 { - // std::fs::copy(&p, dir.path().join(file_name)).unwrap(); - // } - // file_count += 1; - // } - // } - // } - // let start = engine.file_span(LogQueue::Append).0; - // let engine = engine.reopen(); - // // Duplicate log files will be skipped and cleared. - // assert!(engine.file_span(LogQueue::Append).0 > start); } #[test] diff --git a/src/env/default.rs b/src/env/default.rs index 9839e668..c7bdd6f7 100644 --- a/src/env/default.rs +++ b/src/env/default.rs @@ -37,6 +37,7 @@ impl From for OFlag { /// supported on *Unix*, and primarily optimized for *Linux*. /// /// All [`LogFd`] instances are opened with read and write permission. 
+#[derive(Debug)] pub struct LogFd(RawFd); impl LogFd { diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 63632994..268b945f 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -7,6 +7,7 @@ use crate::internals::FileSeq; use crate::internals::LogQueue; use crossbeam::channel::unbounded; use crossbeam::channel::Sender; +use fail::fail_point; use log::{info, warn}; use std::fs; use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; @@ -56,7 +57,7 @@ pub struct DoubleWriteFileSystem { impl DoubleWriteFileSystem { pub fn new(path1: PathBuf, path2: PathBuf) -> Self { - // TODO: check sync + // TODO: fix unsynced files // assume they are synced now. Self::check_sync(&path1, &path2); @@ -68,6 +69,7 @@ impl DoubleWriteFileSystem { if task == Task::Stop { break; } + fail_point!("double_write::thread1"); let res = Self::handle(&fs, task); cb(res); } @@ -382,7 +384,17 @@ impl DoubleWriteHandle { fn read(&self, offset: usize, buf: &mut [u8]) -> IoResult { // TODO: read simultaneously from both disks // choose latest to perform read - if self.counter1.load(Ordering::Relaxed) >= self.counter2.load(Ordering::Relaxed) { + let count1 = self.counter1.load(Ordering::Relaxed); + let count2 = self.counter2.load(Ordering::Relaxed); + if count1 == count2 { + if let Some(fd) = self.fd1.read().unwrap().as_ref() { + fd.read(offset, buf) + } else if let Some(fd) = self.fd2.read().unwrap().as_ref() { + fd.read(offset, buf) + } else { + panic!("Both fd1 and fd2 are None"); + } + } else if count1 > count2 { self.fd1.read().unwrap().as_ref().unwrap().read(offset, buf) } else { self.fd2.read().unwrap().as_ref().unwrap().read(offset, buf) diff --git a/src/env/mod.rs b/src/env/mod.rs index 3d47e095..4f7c85f7 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -9,8 +9,8 @@ mod double_write; mod obfuscated; pub use default::DefaultFileSystem; -pub use obfuscated::ObfuscatedFileSystem; pub use double_write::DoubleWriteFileSystem; +pub use obfuscated::ObfuscatedFileSystem; #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum Permission { diff --git a/tests/failpoints/test_engine.rs b/tests/failpoints/test_engine.rs index a2700532..ca5b4306 100644 --- a/tests/failpoints/test_engine.rs +++ b/tests/failpoints/test_engine.rs @@ -1,5 +1,7 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. +use raft_engine::env::DoubleWriteFileSystem; +use std::path::Path; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Barrier}; use std::time::Duration; @@ -1186,3 +1188,102 @@ fn test_build_engine_with_recycling_and_multi_dirs() { ); } } + +#[test] +fn test_start_engine_with_slow_second_disk() { + let dir = tempfile::Builder::new() + .prefix("test_start_engine_with_slow_second_disk_default") + .tempdir() + .unwrap(); + let sec_dir = tempfile::Builder::new() + .prefix("test_start_engine_with_slow_second_disk_second") + .tempdir() + .unwrap(); + fn number_of_files(p: &Path) -> usize { + let mut r = 0; + std::fs::read_dir(p).unwrap().for_each(|e| { + if e.unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("000") + { + r += 1; + } + }); + r + } + fail::cfg("double_write::thread1", "pause").unwrap(); + let file_system = Arc::new(DoubleWriteFileSystem::new( + dir.path().to_path_buf(), + sec_dir.path().to_path_buf(), + )); + let entry_data = vec![b'x'; 512]; + + // Preparations for multi-dirs. 
+ let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + enable_log_recycle: false, + target_file_size: ReadableSize(1), + ..Default::default() + }; + + // Step 1: write data into the main directory. + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + for rid in 1..=10 { + append(&engine, rid, 1, 10, Some(&entry_data)); + } + for rid in 1..=10 { + assert_eq!(engine.first_index(rid).unwrap(), 1); + } + assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + fail::remove("double_write::thread1"); + drop(engine); + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + + // Restart the engine with recycle and prefill. Test reusing files from both + // dirs. + let cfg_2 = Config { + enable_log_recycle: true, + prefill_for_recycle: true, + purge_threshold: ReadableSize(40), + ..cfg.clone() + }; + let engine = Engine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + for rid in 1..=10 { + assert_eq!(engine.first_index(rid).unwrap(), 1); + let mut log_batch = LogBatch::default(); + log_batch.add_command(rid, Command::Clean); + engine.write(&mut log_batch, true).unwrap(); + } + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + fail::cfg("double_write::thread1", "pause").unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + fail::remove("double_write::thread1"); + + let mut times = 0; + loop { + if number_of_files(sec_dir.path()) == number_of_files(dir.path()) { + break; + } + if times > 50 { + panic!("rewrite queue is not finished"); + } + times += 1; + std::thread::sleep(Duration::from_millis(10)); + } + + let file_count = number_of_files(dir.path()); + // Append data, recycled files are reused. + for rid in 1..=30 { + append(&engine, rid, 20, 30, Some(&entry_data)); + } + // No new file is created. + let file_count1 = number_of_files(dir.path()); + assert_eq!(file_count, file_count1); + assert_eq!(number_of_files(sec_dir.path()), file_count1); +} From 8e5edc5989cbfca25b3b709109622735cd6e7023 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Wed, 19 Jul 2023 15:28:22 -0700 Subject: [PATCH 04/32] rename Signed-off-by: Connor1996 --- src/engine.rs | 2 +- src/env/double_write.rs | 61 ++++++++++++++++----------------- src/env/mod.rs | 2 +- tests/failpoints/test_engine.rs | 4 +-- 4 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index be82c6da..ca965f82 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2623,7 +2623,7 @@ pub(crate) mod tests { }); r } - let file_system = Arc::new(DoubleWriteFileSystem::new( + let file_system = Arc::new(HedgedFileSystem::new( dir.path().to_path_buf(), sec_dir.path().to_path_buf(), )); diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 268b945f..6dd1abff 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -45,7 +45,7 @@ enum Task { Stop, } -pub struct DoubleWriteFileSystem { +pub struct HedgedFileSystem { path1: PathBuf, path2: PathBuf, disk1: Sender<(Task, Callback)>, @@ -55,7 +55,7 @@ pub struct DoubleWriteFileSystem { handle2: Option>, } -impl DoubleWriteFileSystem { +impl HedgedFileSystem { pub fn new(path1: PathBuf, path2: PathBuf) -> Self { // TODO: fix unsynced files // assume they are synced now. 
@@ -171,16 +171,16 @@ impl DoubleWriteFileSystem { } } - async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { + async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); self.disk1.send((task1, cb1)).unwrap(); self.disk2.send((task2, cb2)).unwrap(); select! { - res1 = f1 => res1.unwrap().map(|h| DoubleWriteHandle::new( + res1 = f1 => res1.unwrap().map(|h| HedgedHandle::new( Either::Right(h.unwrap()), Either::Left(f2) )), - res2 = f2 => res2.unwrap().map(|h| DoubleWriteHandle::new( + res2 = f2 => res2.unwrap().map(|h| HedgedHandle::new( Either::Left(f1), Either::Right(h.unwrap()) )), } } @@ -219,7 +219,7 @@ impl DoubleWriteFileSystem { } } -impl Drop for DoubleWriteFileSystem { +impl Drop for HedgedFileSystem { fn drop(&mut self) { self.disk1.send((Task::Stop, Box::new(|_| {}))).unwrap(); self.disk2.send((Task::Stop, Box::new(|_| {}))).unwrap(); @@ -228,10 +228,10 @@ impl Drop for DoubleWriteFileSystem { } } -impl FileSystem for DoubleWriteFileSystem { - type Handle = DoubleWriteHandle; - type Reader = DoubleWriteReader; - type Writer = DoubleWriteWriter; +impl FileSystem for HedgedFileSystem { + type Handle = HedgedHandle; + type Reader = HedgedReader; + type Writer = HedgedWriter; fn create>(&self, path: P) -> IoResult { block_on(self.wait_handle( @@ -274,11 +274,11 @@ impl FileSystem for DoubleWriteFileSystem { } fn new_reader(&self, handle: Arc) -> IoResult { - Ok(DoubleWriteReader::new(handle)) + Ok(HedgedReader::new(handle)) } fn new_writer(&self, handle: Arc) -> IoResult { - Ok(DoubleWriteWriter::new(handle)) + Ok(HedgedWriter::new(handle)) } } @@ -292,7 +292,7 @@ enum FileTask { Stop, } -pub struct DoubleWriteHandle { +pub struct HedgedHandle { disk1: Sender<(FileTask, Callback)>, disk2: Sender<(FileTask, Callback)>, counter1: Arc, @@ -304,7 +304,7 @@ pub struct DoubleWriteHandle { handle2: Option>, } -impl DoubleWriteHandle { +impl HedgedHandle { pub fn new( file1: Either>>, LogFd>, file2: Either>>, LogFd>, @@ -426,7 +426,7 @@ impl DoubleWriteHandle { } } -impl Drop for DoubleWriteHandle { +impl Drop for HedgedHandle { fn drop(&mut self) { self.disk1.send((FileTask::Stop, Box::new(|_| {}))).unwrap(); self.disk2.send((FileTask::Stop, Box::new(|_| {}))).unwrap(); @@ -435,7 +435,7 @@ impl Drop for DoubleWriteHandle { } } -impl Handle for DoubleWriteHandle { +impl Handle for HedgedHandle { fn truncate(&self, offset: usize) -> IoResult<()> { block_on(self.wait_one(FileTask::Truncate(offset))).map(|_| ()) } @@ -449,13 +449,13 @@ impl Handle for DoubleWriteHandle { } } -pub struct DoubleWriteWriter { - inner: Arc, +pub struct HedgedWriter { + inner: Arc, offset: usize, } -impl DoubleWriteWriter { - pub fn new(handle: Arc) -> Self { +impl HedgedWriter { + pub fn new(handle: Arc) -> Self { Self { inner: handle, offset: 0, @@ -463,7 +463,7 @@ impl DoubleWriteWriter { } } -impl Write for DoubleWriteWriter { +impl Write for HedgedWriter { fn write(&mut self, buf: &[u8]) -> IoResult { let len = self.inner.write(self.offset, buf)?; self.offset += len; @@ -475,7 +475,7 @@ impl Write for DoubleWriteWriter { } } -impl WriteExt for DoubleWriteWriter { +impl WriteExt for HedgedWriter { fn truncate(&mut self, offset: usize) -> IoResult<()> { self.inner.truncate(offset)?; self.offset = offset; @@ -487,7 +487,7 @@ impl WriteExt for DoubleWriteWriter { } } -impl Seek for DoubleWriteWriter { +impl Seek for HedgedWriter { fn seek(&mut self, pos: SeekFrom) -> IoResult { match pos { 
SeekFrom::Start(offset) => self.offset = offset as usize, @@ -498,13 +498,13 @@ impl Seek for DoubleWriteWriter { } } -pub struct DoubleWriteReader { - inner: Arc, +pub struct HedgedReader { + inner: Arc, offset: usize, } -impl DoubleWriteReader { - pub fn new(handle: Arc) -> Self { +impl HedgedReader { + pub fn new(handle: Arc) -> Self { Self { inner: handle, offset: 0, @@ -512,7 +512,7 @@ impl DoubleWriteReader { } } -impl Seek for DoubleWriteReader { +impl Seek for HedgedReader { fn seek(&mut self, pos: SeekFrom) -> IoResult { match pos { SeekFrom::Start(offset) => self.offset = offset as usize, @@ -523,7 +523,7 @@ impl Seek for DoubleWriteReader { } } -impl Read for DoubleWriteReader { +impl Read for HedgedReader { fn read(&mut self, buf: &mut [u8]) -> IoResult { let len = self.inner.read(self.offset, buf)?; self.offset += len; @@ -531,8 +531,7 @@ impl Read for DoubleWriteReader { } } -pub fn paired_future_callback( -) -> (Callback, oneshot::Receiver>>) { +pub fn paired_future_callback() -> (Callback, oneshot::Receiver>>) { let (tx, future) = oneshot::channel(); let callback = Box::new(move |result| { let r = tx.send(result); diff --git a/src/env/mod.rs b/src/env/mod.rs index 4f7c85f7..a187c534 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -9,7 +9,7 @@ mod double_write; mod obfuscated; pub use default::DefaultFileSystem; -pub use double_write::DoubleWriteFileSystem; +pub use double_write::HedgedFileSystem; pub use obfuscated::ObfuscatedFileSystem; #[derive(Clone, Copy, PartialEq, Eq, Debug)] diff --git a/tests/failpoints/test_engine.rs b/tests/failpoints/test_engine.rs index ca5b4306..ac7f7827 100644 --- a/tests/failpoints/test_engine.rs +++ b/tests/failpoints/test_engine.rs @@ -1,6 +1,6 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. -use raft_engine::env::DoubleWriteFileSystem; +use raft_engine::env::HedgedFileSystem; use std::path::Path; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Barrier}; @@ -1216,7 +1216,7 @@ fn test_start_engine_with_slow_second_disk() { r } fail::cfg("double_write::thread1", "pause").unwrap(); - let file_system = Arc::new(DoubleWriteFileSystem::new( + let file_system = Arc::new(HedgedFileSystem::new( dir.path().to_path_buf(), sec_dir.path().to_path_buf(), )); From 62a904084467a0a1eacc42e1e1f1a9799381937d Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 8 Aug 2023 17:31:18 -0700 Subject: [PATCH 05/32] add test Signed-off-by: Connor1996 --- src/config.rs | 5 +- src/engine.rs | 181 +++++++++++---- src/env/double_write.rs | 360 +++++++++++++++++++++--------- src/file_pipe_log/log_file.rs | 2 +- src/file_pipe_log/mod.rs | 8 +- src/file_pipe_log/pipe_builder.rs | 6 +- src/file_pipe_log/reader.rs | 2 +- src/swappy_allocator.rs | 2 +- tests/failpoints/test_engine.rs | 38 ++-- 9 files changed, 433 insertions(+), 171 deletions(-) diff --git a/src/config.rs b/src/config.rs index fd467571..3b5036be 100644 --- a/src/config.rs +++ b/src/config.rs @@ -40,6 +40,8 @@ pub struct Config { /// Default: None pub spill_dir: Option, + pub second_dir: Option, + /// How to deal with file corruption during recovery. /// /// Default: "tolerate-tail-corruption". @@ -104,7 +106,7 @@ pub struct Config { /// Whether to prepare log files for recycling when start. /// If `true`, batch empty log files will be prepared for recycling when /// starting engine. - /// Only available for `enable-log-reycle` is true. + /// Only available for `enable-log-recycle` is true. 
/// /// Default: false pub prefill_for_recycle: bool, @@ -123,6 +125,7 @@ impl Default for Config { let mut cfg = Config { dir: "".to_owned(), spill_dir: None, + second_dir: None, recovery_mode: RecoveryMode::TolerateTailCorruption, recovery_read_block_size: ReadableSize::kb(16), recovery_threads: 4, diff --git a/src/engine.rs b/src/engine.rs index ca965f82..a5987f2f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -10,7 +10,6 @@ use std::time::{Duration, Instant}; use log::{error, info}; use protobuf::{parse_from_bytes, Message}; -use crate::config::{Config, RecoveryMode}; use crate::consistency::ConsistencyChecker; use crate::env::{DefaultFileSystem, FileSystem}; use crate::event_listener::EventListener; @@ -22,6 +21,10 @@ use crate::metrics::*; use crate::pipe_log::{FileBlockHandle, FileId, LogQueue, PipeLog}; use crate::purge::{PurgeHook, PurgeManager}; use crate::write_barrier::{WriteBarrier, Writer}; +use crate::{ + config::{Config, RecoveryMode}, + env::HedgedFileSystem, +}; use crate::{perf_context, Error, GlobalStats, Result}; const METRICS_FLUSH_INTERVAL: Duration = Duration::from_secs(30); @@ -69,8 +72,13 @@ where { pub fn open_with_file_system( cfg: Config, - file_system: Arc, + mut file_system: Arc, ) -> Result>> { + file_system = if let Some(sec_dir) = cfg.second_dir { + let fs = Arc::new(HedgedFileSystem::new(file_system, cfg.dir.into(), sec_dir.into())); + fs.bootstrap()?; + fs + }; Self::open_with(cfg, file_system, vec![]) } @@ -623,7 +631,7 @@ where #[cfg(test)] pub(crate) mod tests { use super::*; - use crate::env::{ObfuscatedFileSystem, Permission}; + use crate::env::{HedgedFileSystem, ObfuscatedFileSystem, Permission}; use crate::file_pipe_log::{parse_reserved_file_name, FileNameExt}; use crate::log_batch::AtomicGroupBuilder; use crate::pipe_log::Version; @@ -2597,33 +2605,36 @@ pub(crate) mod tests { assert!(engine.raft_groups().is_empty()); } + fn number_of_files(p: &Path) -> usize { + let mut r = 0; + std::fs::read_dir(p).unwrap().for_each(|e| { + if e.unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("000") + { + r += 1; + } + }); + r + } + #[test] fn test_start_engine_with_second_disk() { let dir = tempfile::Builder::new() - .prefix("test_start_engine_with_multi_dirs_default") + .prefix("test_start_engine_with_second_disk_default") .tempdir() .unwrap(); let sec_dir = tempfile::Builder::new() - .prefix("test_start_engine_with_multi_dirs_second") + .prefix("test_start_engine_with_second_disk_second") .tempdir() .unwrap(); - fn number_of_files(p: &Path) -> usize { - let mut r = 0; - std::fs::read_dir(p).unwrap().for_each(|e| { - if e.unwrap() - .path() - .file_name() - .unwrap() - .to_str() - .unwrap() - .starts_with("000") - { - r += 1; - } - }); - r - } + let file_system = Arc::new(HedgedFileSystem::new( + Arc::new(DefaultFileSystem {}), dir.path().to_path_buf(), sec_dir.path().to_path_buf(), )); @@ -2651,9 +2662,62 @@ pub(crate) mod tests { enable_log_recycle: true, prefill_for_recycle: true, purge_threshold: ReadableSize(40), - ..cfg.clone() + ..cfg }; - let engine = RaftLogEngine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_2, file_system).unwrap(); + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + for rid in 1..=10 { + assert_eq!(engine.first_index(rid).unwrap(), 1); + engine.clean(rid); + } + engine.purge_manager.must_rewrite_append_queue(None, None); + let file_count = number_of_files(dir.path()); + 
assert_eq!(number_of_files(sec_dir.path()), file_count); + assert!(file_count > engine.file_count(None)); + // Append data, recycled files are reused. + for rid in 1..=30 { + engine.append(rid, 20, 30, Some(&entry_data)); + } + // No new file is created. + let file_count1 = number_of_files(dir.path()); + assert_eq!(file_count, file_count1); + assert_eq!(number_of_files(sec_dir.path()), file_count1); + } + + #[test] + fn test_start_engine_with_abnormal_second_disk() { + let dir = tempfile::Builder::new() + .prefix("test_start_engine_with_abnormal_second_disk_default") + .tempdir() + .unwrap(); + let sec_dir = tempfile::Builder::new() + .prefix("test_start_engine_with_abnormal_second_disk_second") + .tempdir() + .unwrap(); + + let file_system = Arc::new(HedgedFileSystem::new( + Arc::new(DefaultFileSystem {}), + dir.path().to_path_buf(), + sec_dir.path().to_path_buf(), + )); + let entry_data = vec![b'x'; 512]; + + // Preparations for multi-dirs. + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + enable_log_recycle: true, + prefill_for_recycle: true, + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(40), + ..Default::default() + }; + + // Step 1: write data into the main directory. + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + for rid in 1..=10 { + engine.append(rid, 1, 10, Some(&entry_data)); + } assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 1); @@ -2671,6 +2735,60 @@ pub(crate) mod tests { let file_count1 = number_of_files(dir.path()); assert_eq!(file_count, file_count1); assert_eq!(number_of_files(sec_dir.path()), file_count1); + drop(engine); + + // abnormal case - Empty second dir + { + std::fs::remove_dir_all(sec_dir.path()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + } + // abnormal case - Missing some append files in second dir + { + let mut file_count = 0; + for e in std::fs::read_dir(sec_dir.path()).unwrap() { + let p = e.unwrap().path(); + let file_name = p.file_name().unwrap().to_str().unwrap(); + println!("file_name: {}", file_name); + if let Some(FileId { + queue: LogQueue::Append, + seq: _, + }) = FileId::parse_file_name(file_name) + { + if file_count % 2 == 0 { + std::fs::remove_file(dir.path().join(file_name)).unwrap(); + } + file_count += 1; + } + } + let engine = RaftLogEngine::open_with_file_system(cfg, file_system).unwrap(); + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + } + // abnormal case - Missing some rewrite files in second dir + { + let mut file_count = 0; + for e in std::fs::read_dir(sec_dir.path()).unwrap() { + let p = e.unwrap().path(); + let file_name = p.file_name().unwrap().to_str().unwrap(); + println!("file_name: {}", file_name); + if let Some(FileId { + queue: LogQueue::Rewrite, + seq: _, + }) = FileId::parse_file_name(file_name) + { + if file_count % 2 == 0 { + std::fs::remove_file(dir.path().join(file_name)).unwrap(); + } + file_count += 1; + } + } + let engine = RaftLogEngine::open_with_file_system(cfg, file_system).unwrap(); + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + } + // abnormal case - Missing some reserve files in second dir + // abnormal case - Have some extra files in second dir + // abnormal case - One file is corrupted } #[test] @@ -2683,22 +2801,7 @@ 
pub(crate) mod tests { .prefix("test_start_engine_with_multi_dirs_spill") .tempdir() .unwrap(); - fn number_of_files(p: &Path) -> usize { - let mut r = 0; - std::fs::read_dir(p).unwrap().for_each(|e| { - if e.unwrap() - .path() - .file_name() - .unwrap() - .to_str() - .unwrap() - .starts_with("000") - { - r += 1; - } - }); - r - } + let file_system = Arc::new(DeleteMonitoredFileSystem::new()); let entry_data = vec![b'x'; 512]; diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 6dd1abff..a444a01e 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -1,14 +1,18 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. +use crate::file_pipe_log::log_file::build_file_reader; +use crate::file_pipe_log::pipe::File; +use crate::file_pipe_log::pipe_builder::FileName; +use crate::file_pipe_log::reader::LogItemBatchFileReader; use crate::file_pipe_log::FileNameExt; use crate::internals::parse_reserved_file_name; use crate::internals::FileId; -use crate::internals::FileSeq; use crate::internals::LogQueue; +use crate::{Error, Result}; use crossbeam::channel::unbounded; use crossbeam::channel::Sender; use fail::fail_point; -use log::{info, warn}; +use log::{info, warn, Log}; use std::fs; use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; use std::path::Path; @@ -45,7 +49,25 @@ enum Task { Stop, } -pub struct HedgedFileSystem { +#[derive(Default)] +struct Files { + prefix: PathBuf, + append_file: Vec, + rewrite_file: Vec, + recycled_file: Vec, +} + +fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { + if let Ok(file) = path.strip_prefix(from) { + to.to_path_buf().join(file) + } else { + panic!("Invalid path: {:?}", path); + } +} + +pub struct HedgedFileSystem { + inner: Arc, + path1: PathBuf, path2: PathBuf, disk1: Sender<(Task, Callback)>, @@ -55,36 +77,37 @@ pub struct HedgedFileSystem { handle2: Option>, } -impl HedgedFileSystem { - pub fn new(path1: PathBuf, path2: PathBuf) -> Self { - // TODO: fix unsynced files - // assume they are synced now. - Self::check_sync(&path1, &path2); +// TODO: read both dir at recovery, maybe no need? 
cause operations are to both +// disks TODO: consider encryption +impl HedgedFileSystem { + pub fn new(inner: Arc, path1: PathBuf, path2: PathBuf) -> Self { let (tx1, rx1) = unbounded::<(Task, Callback)>(); let (tx2, rx2) = unbounded::<(Task, Callback)>(); + let fs1 = inner.clone(); let handle1 = thread::spawn(|| { - let fs = DefaultFileSystem {}; for (task, cb) in rx1 { if task == Task::Stop { break; } fail_point!("double_write::thread1"); - let res = Self::handle(&fs, task); + let res = Self::handle(&fs1, task); cb(res); } }); + let fs2 = inner.clone(); let handle2 = thread::spawn(|| { let fs = DefaultFileSystem {}; for (task, cb) in rx2 { if task == Task::Stop { break; } - let res = Self::handle(&fs, task); + let res = Self::handle(&fs2, task); cb(res); } }); Self { + inner, path1, path2, disk1: tx1, @@ -94,81 +117,207 @@ impl HedgedFileSystem { } } - fn check_sync(path1: &PathBuf, path2: &PathBuf) { - let check = |path: &PathBuf| -> (Vec, Vec, Vec) { - let mut append_file_names = vec![]; - let mut rewrite_file_names = vec![]; - let mut recycled_file_names = vec![]; - if !path.exists() { - info!("Create raft log directory: {}", path.display()); - fs::create_dir(path).unwrap(); + pub fn bootstrap(&self) -> Result<()> { + // catch up diff + let files1 = self.get_files(&self.path1)?; + let files2 = self.get_files(&self.path2)?; + + let count1 = self.get_latest_valid_seq(&files1)?; + let count2 = self.get_latest_valid_seq(&files2)?; + + match count1.cmp(&count2) { + std::cmp::Ordering::Equal => { + // TODO: still need to catch up + return Ok(()); } + std::cmp::Ordering::Less => { + self.catch_up_diff(files2, files1)?; + } + std::cmp::Ordering::Greater => { + self.catch_up_diff(files1, files2)?; + } + } + Ok(()) + } - fs::read_dir(path) - .unwrap() - .try_for_each(|e| -> IoResult<()> { - let dir_entry = e?; - let p = dir_entry.path(); - if !p.is_file() { - return Ok(()); + fn catch_up_diff(&self, fromFiles: Files, toFiles: Files) -> Result<()> { + let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { + let mut iter1 = from.iter().peekable(); + let mut iter2 = to.iter().peekable(); + // compare files of from and to, if the file in from is not in to, copy it to + // to, and if the file in to is not in from, delete it + loop { + match (iter1.peek(), iter2.peek()) { + (None, None) => break, + (Some(f1), None) => { + let to = replace_path( + f1.path.as_ref(), + fromFiles.prefix.as_ref(), + toFiles.prefix.as_ref(), + ); + fs::copy(&f1.path, to)?; + iter1.next(); + } + (None, Some(f2)) => { + fs::remove_file(&f2.path)?; + iter2.next(); } - let file_name = p.file_name().unwrap().to_str().unwrap(); - match FileId::parse_file_name(file_name) { - Some(FileId { - queue: LogQueue::Append, - seq, - }) => append_file_names.push(seq), - Some(FileId { - queue: LogQueue::Rewrite, - seq, - }) => rewrite_file_names.push(seq), - _ => { - if let Some(seq) = parse_reserved_file_name(file_name) { - recycled_file_names.push(seq); + (Some(f1), Some(f2)) => { + match f1.seq.cmp(&f2.seq) { + std::cmp::Ordering::Equal => { + // TODO: do we need to check file size? 
+ // if f1.handle.file_size() != f2.handle.file_size() { + // let to = replace_path(f1.path.as_ref(), + // fromFiles.prefix.as_ref(), toFiles.prefix.as_ref()); + // fs::copy(&f1.path, &to)?; + // } + iter1.next(); + iter2.next(); + } + std::cmp::Ordering::Less => { + let to = replace_path( + f1.path.as_ref(), + fromFiles.prefix.as_ref(), + toFiles.prefix.as_ref(), + ); + fs::copy(&f1.path, to)?; + iter1.next(); + } + std::cmp::Ordering::Greater => { + fs::remove_file(&f2.path)?; + iter2.next(); } } } - Ok(()) - }) - .unwrap(); - append_file_names.sort(); - rewrite_file_names.sort(); - recycled_file_names.sort(); - (append_file_names, rewrite_file_names, recycled_file_names) + } + } + Ok(()) }; - let (append1, rewrite1, recycle1) = check(path1); - let (append2, rewrite2, recycle2) = check(path2); - if append1.first() != append2.first() { - panic!("Append file seq not match: {:?} vs {:?}", append1, append2); - } - if append1.last() != append2.last() { - panic!("Append file seq not match: {:?} vs {:?}", append1, append2); - } - if rewrite1.first() != rewrite2.first() { - panic!( - "Rewrite file seq not match: {:?} vs {:?}", - rewrite1, rewrite2 + check_files(&fromFiles.append_file, &toFiles.append_file)?; + check_files(&fromFiles.rewrite_file, &toFiles.rewrite_file)?; + check_files(&fromFiles.recycled_file, &toFiles.recycled_file)?; + + // check file size is not enough, treat the last files differently considering + // the recycle, always copy the last file + if let Some(last_file) = fromFiles.append_file.last() { + let to = replace_path( + last_file.path.as_ref(), + fromFiles.prefix.as_ref(), + toFiles.prefix.as_ref(), ); + fs::copy(&last_file.path, to)?; } - if rewrite1.last() != rewrite2.last() { - panic!( - "Rewrite file seq not match: {:?} vs {:?}", - rewrite1, rewrite2 + if let Some(last_file) = fromFiles.rewrite_file.last() { + let to = replace_path( + last_file.path.as_ref(), + fromFiles.prefix.as_ref(), + toFiles.prefix.as_ref(), ); + fs::copy(&last_file.path, to)?; } - if recycle1.first() != recycle2.first() { - panic!( - "Recycle file seq not match: {:?} vs {:?}", - recycle1, recycle2 + if let Some(last_file) = fromFiles.recycled_file.last() { + let to = replace_path( + last_file.path.as_ref(), + fromFiles.prefix.as_ref(), + toFiles.prefix.as_ref(), ); + fs::copy(&last_file.path, to)?; } - if recycle1.last() != recycle2.last() { - panic!( - "Recycle file seq not match: {:?} vs {:?}", - recycle1, recycle2 - ); + + Ok(()) + } + + fn get_files(&self, path: &PathBuf) -> IoResult { + let mut files = Files { + prefix: path.clone(), + ..Default::default() + }; + if !path.exists() { + info!("Create raft log directory: {}", path.display()); + fs::create_dir(path).unwrap(); } + + fs::read_dir(path) + .unwrap() + .try_for_each(|e| -> IoResult<()> { + let dir_entry = e?; + let p = dir_entry.path(); + if !p.is_file() { + return Ok(()); + } + let file_name = p.file_name().unwrap().to_str().unwrap(); + match FileId::parse_file_name(file_name) { + Some(FileId { + queue: LogQueue::Append, + seq, + }) => files.append_file.push(FileName { + seq, + path: p, + path_id: 0, + }), + Some(FileId { + queue: LogQueue::Rewrite, + seq, + }) => files.rewrite_file.push(FileName { + seq, + path: p, + path_id: 0, + }), + _ => { + if let Some(seq) = parse_reserved_file_name(file_name) { + files.recycled_file.push(FileName { + seq, + path: p, + path_id: 0, + }) + } + } + } + Ok(()) + }) + .unwrap(); + files.append_file.sort_by(|a, b| a.seq.cmp(&b.seq)); + files.rewrite_file.sort_by(|a, b| a.seq.cmp(&b.seq)); + 
files.recycled_file.sort_by(|a, b| a.seq.cmp(&b.seq)); + Ok(files) + } + + fn get_latest_valid_seq(&self, files: &Files) -> Result { + let mut count = 0; + if let Some(f) = files.append_file.last() { + let recovery_read_block_size = 1024; + let mut reader = LogItemBatchFileReader::new(recovery_read_block_size); + // TODO: change file system + let handle = Arc::new(DefaultFileSystem {}.open(&f.path, Permission::ReadOnly)?); + let file_reader = build_file_reader(&DefaultFileSystem {}, handle)?; + match reader.open( + FileId { + queue: LogQueue::Append, + seq: f.seq, + }, + file_reader, + ) { + Err(e) if matches!(e, Error::Io(_)) => return Err(e), + Err(e) => { + return Ok(0); + } + Ok(format) => { + // Do nothing + } + } + loop { + match reader.next() { + Ok(Some(item_batch)) => { + count += 1; + } + Ok(None) => break, + Err(e) => break, + } + } + } + + Ok(count) } async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { @@ -197,19 +346,11 @@ impl HedgedFileSystem { } } - fn replace_path(&self, path: &Path) -> PathBuf { - if let Ok(file) = path.strip_prefix(&self.path1) { - self.path2.clone().join(file) - } else { - panic!("Invalid path: {:?}", path); - } - } - #[inline] - fn handle(file_system: &DefaultFileSystem, task: Task) -> IoResult> { + fn handle(file_system: &F, task: Task) -> IoResult> { match task { - Task::Create(path) => file_system.create(path).map(|h| Some(h)), - Task::Open { path, perm } => file_system.open(path, perm).map(|h| Some(h)), + Task::Create(path) => file_system.create(path).map(Some), + Task::Open { path, perm } => file_system.open(path, perm).map(Some), Task::Delete(path) => file_system.delete(path).map(|_| None), Task::Rename { src_path, dst_path } => { file_system.rename(src_path, dst_path).map(|_| None) @@ -219,7 +360,7 @@ impl HedgedFileSystem { } } -impl Drop for HedgedFileSystem { +impl Drop for HedgedFileSystem { fn drop(&mut self) { self.disk1.send((Task::Stop, Box::new(|_| {}))).unwrap(); self.disk2.send((Task::Stop, Box::new(|_| {}))).unwrap(); @@ -228,7 +369,7 @@ impl Drop for HedgedFileSystem { } } -impl FileSystem for HedgedFileSystem { +impl FileSystem for HedgedFileSystem { type Handle = HedgedHandle; type Reader = HedgedReader; type Writer = HedgedWriter; @@ -236,7 +377,11 @@ impl FileSystem for HedgedFileSystem { fn create>(&self, path: P) -> IoResult { block_on(self.wait_handle( Task::Create(path.as_ref().to_path_buf()), - Task::Create(self.replace_path(path.as_ref())), + Task::Create(replace_path( + path.as_ref(), + self.path1.as_ref(), + self.path2.as_ref(), + )), )) } @@ -247,7 +392,7 @@ impl FileSystem for HedgedFileSystem { perm, }, Task::Open { - path: self.replace_path(path.as_ref()), + path: replace_path(path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), perm, }, )) @@ -256,7 +401,11 @@ impl FileSystem for HedgedFileSystem { fn delete>(&self, path: P) -> IoResult<()> { block_on(self.wait_one( Task::Delete(path.as_ref().to_path_buf()), - Task::Delete(self.replace_path(path.as_ref())), + Task::Delete(replace_path( + path.as_ref(), + self.path1.as_ref(), + self.path2.as_ref(), + )), )) } @@ -267,8 +416,8 @@ impl FileSystem for HedgedFileSystem { dst_path: dst_path.as_ref().to_path_buf(), }, Task::Rename { - src_path: self.replace_path(src_path.as_ref()), - dst_path: self.replace_path(dst_path.as_ref()), + src_path: replace_path(src_path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), + dst_path: replace_path(dst_path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), }, )) } @@ -373,9 +522,9 @@ impl HedgedHandle { fn 
handle(fd: &LogFd, task: FileTask) -> IoResult> { match task { FileTask::Truncate(offset) => fd.truncate(offset).map(|_| None), - FileTask::FileSize => fd.file_size().map(|x| Some(x)), + FileTask::FileSize => fd.file_size().map(Some), FileTask::Sync => fd.sync().map(|_| None), - FileTask::Write { offset, bytes } => fd.write(offset, &bytes).map(|x| Some(x)), + FileTask::Write { offset, bytes } => fd.write(offset, &bytes).map(Some), FileTask::Allocate { offset, size } => fd.allocate(offset, size).map(|_| None), FileTask::Stop => unreachable!(), } @@ -386,18 +535,22 @@ impl HedgedHandle { // choose latest to perform read let count1 = self.counter1.load(Ordering::Relaxed); let count2 = self.counter2.load(Ordering::Relaxed); - if count1 == count2 { - if let Some(fd) = self.fd1.read().unwrap().as_ref() { - fd.read(offset, buf) - } else if let Some(fd) = self.fd2.read().unwrap().as_ref() { - fd.read(offset, buf) - } else { - panic!("Both fd1 and fd2 are None"); + match count1.cmp(&count2) { + std::cmp::Ordering::Equal => { + if let Some(fd) = self.fd1.read().unwrap().as_ref() { + fd.read(offset, buf) + } else if let Some(fd) = self.fd2.read().unwrap().as_ref() { + fd.read(offset, buf) + } else { + panic!("Both fd1 and fd2 are None"); + } + } + std::cmp::Ordering::Greater => { + self.fd1.read().unwrap().as_ref().unwrap().read(offset, buf) + } + std::cmp::Ordering::Less => { + self.fd2.read().unwrap().as_ref().unwrap().read(offset, buf) } - } else if count1 > count2 { - self.fd1.read().unwrap().as_ref().unwrap().read(offset, buf) - } else { - self.fd2.read().unwrap().as_ref().unwrap().read(offset, buf) } } @@ -531,7 +684,8 @@ impl Read for HedgedReader { } } -pub fn paired_future_callback() -> (Callback, oneshot::Receiver>>) { +pub fn paired_future_callback( +) -> (Callback, oneshot::Receiver>>) { let (tx, future) = oneshot::channel(); let callback = Box::new(move |result| { let r = tx.send(result); diff --git a/src/file_pipe_log/log_file.rs b/src/file_pipe_log/log_file.rs index 8ba92592..3e73e247 100644 --- a/src/file_pipe_log/log_file.rs +++ b/src/file_pipe_log/log_file.rs @@ -130,7 +130,7 @@ impl LogFileWriter { } /// Build a file reader. -pub(super) fn build_file_reader( +pub(crate) fn build_file_reader( system: &F, handle: Arc, ) -> Result> { diff --git a/src/file_pipe_log/mod.rs b/src/file_pipe_log/mod.rs index fd7c5036..0b036013 100644 --- a/src/file_pipe_log/mod.rs +++ b/src/file_pipe_log/mod.rs @@ -5,10 +5,10 @@ //! [`PipeLog`]: crate::pipe_log::PipeLog mod format; -mod log_file; -mod pipe; -mod pipe_builder; -mod reader; +pub mod log_file; +pub mod pipe; +pub mod pipe_builder; +pub mod reader; pub use format::{parse_reserved_file_name, FileNameExt}; pub use pipe::DualPipes as FilePipeLog; diff --git a/src/file_pipe_log/pipe_builder.rs b/src/file_pipe_log/pipe_builder.rs index 25ca433c..ca0c0a8a 100644 --- a/src/file_pipe_log/pipe_builder.rs +++ b/src/file_pipe_log/pipe_builder.rs @@ -349,8 +349,8 @@ impl DualPipesBuilder { let file_system = self.file_system.clone(); // As the `recover_queue` would update the `LogFileFormat` of each log file - // in `apend_files` and `rewrite_files`, we re-design the implementation on - // `recover_queue` to make it compatiable to concurrent processing + // in `append_files` and `rewrite_files`, we re-design the implementation on + // `recover_queue` to make it compatible to concurrent processing // with ThreadPool. 
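// Illustrative sketch, not part of this patch: the shape of the concurrent recovery
// described in the comment above. Each queue is recovered by its own closure and the
// results are joined afterwards; `recover_append`/`recover_rewrite` are hypothetical
// stand-ins for the real `recover_queue` calls, and std::thread::scope stands in for
// the builder's ThreadPool.
fn recover_queues_concurrently() -> std::io::Result<()> {
    fn recover_append() -> std::io::Result<()> { Ok(()) }  // placeholder work
    fn recover_rewrite() -> std::io::Result<()> { Ok(()) } // placeholder work

    let (append, rewrite) = std::thread::scope(|s| {
        let a = s.spawn(recover_append);
        let b = s.spawn(recover_rewrite);
        (a.join().unwrap(), b.join().unwrap())
    });
    // Surface an error from either queue before declaring recovery done.
    append?;
    rewrite?;
    Ok(())
}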
let (append, rewrite) = pool.join( || { @@ -630,5 +630,5 @@ pub(super) fn lock_dir>(dir: P) -> Result { pub(crate) struct FileName { pub seq: FileSeq, pub path: PathBuf, - path_id: PathId, + pub path_id: PathId, } diff --git a/src/file_pipe_log/reader.rs b/src/file_pipe_log/reader.rs index 106ba72f..d19d9900 100644 --- a/src/file_pipe_log/reader.rs +++ b/src/file_pipe_log/reader.rs @@ -10,7 +10,7 @@ use super::format::{is_zero_padded, LogFileFormat}; use super::log_file::LogFileReader; /// A reusable reader over [`LogItemBatch`]s in a log file. -pub(super) struct LogItemBatchFileReader { +pub(crate) struct LogItemBatchFileReader { file_id: Option, format: Option, pub(crate) reader: Option>, diff --git a/src/swappy_allocator.rs b/src/swappy_allocator.rs index a6cacb21..9661dc4b 100644 --- a/src/swappy_allocator.rs +++ b/src/swappy_allocator.rs @@ -1023,7 +1023,7 @@ mod tests { // test_extend_ref let mut v = VecDeque::new_in(allocator.clone()); v.push_back(1); - v.extend(&[2, 3, 4]); + v.extend([2, 3, 4]); assert_eq!(v.len(), 4); assert_eq!(v[0], 1); diff --git a/tests/failpoints/test_engine.rs b/tests/failpoints/test_engine.rs index ac7f7827..44fe5a56 100644 --- a/tests/failpoints/test_engine.rs +++ b/tests/failpoints/test_engine.rs @@ -1189,6 +1189,23 @@ fn test_build_engine_with_recycling_and_multi_dirs() { } } +fn number_of_files(p: &Path) -> usize { + let mut r = 0; + std::fs::read_dir(p).unwrap().for_each(|e| { + if e.unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("000") + { + r += 1; + } + }); + r +} + #[test] fn test_start_engine_with_slow_second_disk() { let dir = tempfile::Builder::new() @@ -1199,22 +1216,7 @@ fn test_start_engine_with_slow_second_disk() { .prefix("test_start_engine_with_slow_second_disk_second") .tempdir() .unwrap(); - fn number_of_files(p: &Path) -> usize { - let mut r = 0; - std::fs::read_dir(p).unwrap().for_each(|e| { - if e.unwrap() - .path() - .file_name() - .unwrap() - .to_str() - .unwrap() - .starts_with("000") - { - r += 1; - } - }); - r - } + fail::cfg("double_write::thread1", "pause").unwrap(); let file_system = Arc::new(HedgedFileSystem::new( dir.path().to_path_buf(), @@ -1249,9 +1251,9 @@ fn test_start_engine_with_slow_second_disk() { enable_log_recycle: true, prefill_for_recycle: true, purge_threshold: ReadableSize(40), - ..cfg.clone() + ..cfg }; - let engine = Engine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg_2, file_system).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 1); From 045ecbd006f709c67e9d3193e2f7186bdad52f29 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 22 Aug 2023 21:25:53 +0800 Subject: [PATCH 06/32] adjust Signed-off-by: Connor1996 --- src/engine.rs | 189 ++++++++++++++------------- src/env/default.rs | 1 + src/env/double_write.rs | 275 ++++++++++++++++++++++++++-------------- src/env/mod.rs | 2 +- 4 files changed, 282 insertions(+), 185 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index a5987f2f..04d98dcb 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -31,6 +31,30 @@ const METRICS_FLUSH_INTERVAL: Duration = Duration::from_secs(30); /// Max times for `write`. 
const MAX_WRITE_ATTEMPT: u64 = 2; +// pub struct HedgedEngine> +// where +// F: FileSystem, +// P: PipeLog, +// { +// inner: Engine, P>, +// fs: Arc>, +// } + +// impl HedgedEngine> +// where +// F: FileSystem, +// { + +// } + +// impl Deref for HedgedEngine> { +// type Target = Engine>; + +// fn deref(&self) -> &Self::Target { +// &self.inner +// } +// } + pub struct Engine> where F: FileSystem, @@ -66,23 +90,36 @@ impl Engine> { } } +pub fn open_with_hedged_file_system( + cfg: Config, + file_system: Arc, +) -> Result>> { + let file_system = if let Some(ref sec_dir) = cfg.second_dir { + let fs = Arc::new(HedgedFileSystem::new( + file_system, + cfg.dir.clone().into(), + sec_dir.clone().into(), + )); + fs.bootstrap()?; + fs + } else { + panic!() + }; + Engine::open_with(cfg, file_system, vec![]) +} + +pub fn open_with_file_system( + cfg: Config, + file_system: Arc, +) -> Result>> { + Engine::open_with(cfg, file_system, vec![]) +} + impl Engine> where F: FileSystem, { - pub fn open_with_file_system( - cfg: Config, - mut file_system: Arc, - ) -> Result>> { - file_system = if let Some(sec_dir) = cfg.second_dir { - let fs = Arc::new(HedgedFileSystem::new(file_system, cfg.dir.into(), sec_dir.into())); - fs.bootstrap()?; - fs - }; - Self::open_with(cfg, file_system, vec![]) - } - - pub fn open_with( + fn open_with( mut cfg: Config, file_system: Arc, mut listeners: Vec>, @@ -108,6 +145,9 @@ where stats.clone(), listeners.clone(), ); + // HedgingManager::new( + // pipe_log.clone(), + // ) let (tx, rx) = mpsc::channel(); let stats_clone = stats.clone(); @@ -743,8 +783,7 @@ pub(crate) mod tests { dir: sub_dir.to_str().unwrap().to_owned(), ..Default::default() }; - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); } #[test] @@ -762,11 +801,9 @@ pub(crate) mod tests { ..Default::default() }; - let engine = RaftLogEngine::open_with_file_system( - cfg.clone(), - Arc::new(ObfuscatedFileSystem::default()), - ) - .unwrap(); + let engine = + open_with_file_system(cfg.clone(), Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); assert_eq!(engine.path(), dir.path().to_str().unwrap()); let data = vec![b'x'; entry_size]; for i in 10..20 { @@ -816,7 +853,7 @@ pub(crate) mod tests { target_file_size: ReadableSize(1), ..Default::default() }; - let engine = RaftLogEngine::open_with_file_system( + let engine = open_with_file_system( cfg.clone(), Arc::new(ObfuscatedFileSystem::default()), ) @@ -889,9 +926,7 @@ pub(crate) mod tests { ..Default::default() }; let rid = 1; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); engine .scan_messages::(rid, None, None, false, |_, _| { @@ -978,9 +1013,7 @@ pub(crate) mod tests { let mut delete_batch = LogBatch::default(); delete_batch.delete(rid, key.clone()); - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); assert_eq!( engine.get_message::(rid, &key).unwrap(), None @@ -1089,9 +1122,7 @@ pub(crate) mod tests { target_file_size: ReadableSize(1), ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, 
Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; // rewrite:[1 ..10] @@ -1202,9 +1233,7 @@ pub(crate) mod tests { ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; for index in 0..100 { engine.append(1, index, index + 1, Some(&data)); @@ -1263,9 +1292,7 @@ pub(crate) mod tests { ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; // write 50 small entries into region 1~3, it should trigger force compact. for rid in 1..=3 { @@ -1318,9 +1345,7 @@ pub(crate) mod tests { purge_threshold: ReadableSize::kb(80), ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; // Put 100 entries into 10 regions. @@ -1382,9 +1407,7 @@ pub(crate) mod tests { dir: dir.path().to_str().unwrap().to_owned(), ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let mut log_batch = LogBatch::default(); let empty_entry = Entry::new(); @@ -1442,9 +1465,7 @@ pub(crate) mod tests { dir: dir.path().to_str().unwrap().to_owned(), ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 16]; let cases = [[false, false], [false, true], [true, true]]; for (i, writes) in cases.iter().enumerate() { @@ -1470,9 +1491,7 @@ pub(crate) mod tests { dir: dir.path().to_str().unwrap().to_owned(), ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; for rid in 1..21 { @@ -1503,9 +1522,7 @@ pub(crate) mod tests { target_file_size: ReadableSize(1), ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 2 * 1024 * 1024]; for rid in 1..=3 { @@ -1663,7 +1680,7 @@ pub(crate) mod tests { ..Default::default() }; - let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = open_with_file_system(cfg, fs.clone()).unwrap(); for bs in batches.iter_mut() { for batch in bs.iter_mut() { engine.write(batch, false).unwrap(); @@ -1724,7 +1741,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1761,7 +1778,7 @@ pub(crate) mod tests { ) 
.unwrap(); - let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + let engine = open_with_file_system(cfg, fs).unwrap(); for rid in 1..25 { engine.scan_entries(rid, 1, 6, |_, _, d| { assert_eq!(d, &entry_data); @@ -1789,7 +1806,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1823,7 +1840,7 @@ pub(crate) mod tests { ) .unwrap(); - let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + let engine = open_with_file_system(cfg, fs).unwrap(); for rid in 1..25 { if existing_emptied.contains(&rid) || incoming_emptied.contains(&rid) { continue; @@ -1870,7 +1887,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1891,11 +1908,11 @@ pub(crate) mod tests { // Corrupt a log batch. f.set_len(f.metadata().unwrap().len() - 1).unwrap(); - RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + open_with_file_system(cfg.clone(), fs.clone()).unwrap(); // Corrupt the file header. f.set_len(1).unwrap(); - RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + open_with_file_system(cfg, fs).unwrap(); } #[test] @@ -1912,7 +1929,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } @@ -1920,7 +1937,7 @@ pub(crate) mod tests { assert!(RaftLogEngine::open(cfg.clone()).is_err()); - let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + let engine = open_with_file_system(cfg, fs).unwrap(); for rid in 1..10 { engine.scan_entries(rid, 1, 11, |_, _, d| { assert_eq!(d, &entry_data); @@ -1972,7 +1989,7 @@ pub(crate) mod tests { let fs = Arc::new(ObfuscatedFileSystem::default()); let rid = 1; - let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + let engine = open_with_file_system(cfg, fs).unwrap(); assert!(engine.is_empty()); engine.append(rid, 1, 11, Some(&entry_data)); assert!(!engine.is_empty()); @@ -2109,7 +2126,7 @@ pub(crate) mod tests { ..Default::default() }; let fs = Arc::new(DeleteMonitoredFileSystem::new()); - let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = open_with_file_system(cfg, fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } @@ -2166,7 +2183,7 @@ pub(crate) mod tests { ..Default::default() }; let fs = Arc::new(DeleteMonitoredFileSystem::new()); - let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = open_with_file_system(cfg, fs.clone()).unwrap(); let reserved_start = *fs.reserved_metadata.lock().unwrap().first().unwrap(); for rid in 1..=10 { @@ -2274,14 +2291,14 @@ pub(crate) mod tests { assert!(cfg_v2.recycle_capacity() > 0); // Prepare files with format_version V1 { - let engine = RaftLogEngine::open_with_file_system(cfg_v1.clone(), fs.clone()).unwrap(); + let engine = open_with_file_system(cfg_v1.clone(), 
fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } } // Reopen the Engine with V2 and purge { - let engine = RaftLogEngine::open_with_file_system(cfg_v2.clone(), fs.clone()).unwrap(); + let engine = open_with_file_system(cfg_v2.clone(), fs.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); for rid in 6..=10 { engine.append(rid, 11, 20, Some(&entry_data)); @@ -2295,7 +2312,7 @@ pub(crate) mod tests { } // Reopen the Engine with V1 -> V2 and purge { - let engine = RaftLogEngine::open_with_file_system(cfg_v1, fs.clone()).unwrap(); + let engine = open_with_file_system(cfg_v1, fs.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); for rid in 6..=10 { engine.append(rid, 20, 30, Some(&entry_data)); @@ -2309,7 +2326,7 @@ pub(crate) mod tests { assert_eq!(engine.file_span(LogQueue::Append).0, start); let file_count = engine.file_count(Some(LogQueue::Append)); drop(engine); - let engine = RaftLogEngine::open_with_file_system(cfg_v2, fs).unwrap(); + let engine = open_with_file_system(cfg_v2, fs).unwrap(); assert_eq!(engine.file_span(LogQueue::Append).0, start); assert_eq!(engine.file_count(Some(LogQueue::Append)), file_count); // Mark all regions obsolete. @@ -2340,7 +2357,7 @@ pub(crate) mod tests { enable_log_recycle: false, ..Default::default() }; - let engine = RaftLogEngine::open_with_file_system(cfg, file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg, file_system.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); // Only one valid file left, the last one => active_file. assert_eq!(engine.file_count(Some(LogQueue::Append)), 1); @@ -2362,8 +2379,7 @@ pub(crate) mod tests { prefill_for_recycle: true, ..Default::default() }; - let engine = - RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); let (start, end) = engine.file_span(LogQueue::Append); // Only one valid file left, the last one => active_file. assert_eq!(start, end); @@ -2386,8 +2402,7 @@ pub(crate) mod tests { purge_threshold: ReadableSize(50), ..cfg }; - let engine = - RaftLogEngine::open_with_file_system(cfg_v2.clone(), file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg_v2.clone(), file_system.clone()).unwrap(); assert_eq!(engine.file_span(LogQueue::Append), (start, end)); assert!(recycled_count > file_system.inner.file_count() - engine.file_count(None)); // Recycled files have filled the LogQueue::Append, purge_expired_files won't @@ -2411,7 +2426,7 @@ pub(crate) mod tests { prefill_for_recycle: false, ..cfg_v2 }; - let engine = RaftLogEngine::open_with_file_system(cfg_v3, file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg_v3, file_system.clone()).unwrap(); assert_eq!(file_system.inner.file_count(), engine.file_count(None)); } @@ -2432,7 +2447,7 @@ pub(crate) mod tests { let key = vec![b'x'; 2]; let value = vec![b'y'; 8]; - let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + let engine = open_with_file_system(cfg, fs).unwrap(); let mut data = HashSet::new(); let mut rid = 1; // Directly write to pipe log. 
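// Illustrative sketch, not part of this patch: how the double-write setup added in
// this commit is meant to be used. It assumes the `second_dir` config option (taken
// here as a plain string path) and the module-level `open_with_hedged_file_system`
// helper defined above in engine.rs; the paths are made up for the example.
fn open_engine_on_two_disks() -> Result<()> {
    let cfg = Config {
        dir: "/disk1/raft-engine".to_owned(),
        second_dir: Some("/disk2/raft-engine".to_owned()),
        ..Default::default()
    };
    // The helper wraps the base file system in a HedgedFileSystem over
    // (cfg.dir, cfg.second_dir) and bootstraps it, so a lagging directory is
    // caught up before the engine starts serving writes.
    let _engine = open_with_hedged_file_system(cfg, Arc::new(DefaultFileSystem {}))?;
    // ... use the engine as usual; every file operation is mirrored to both dirs.
    Ok(())
}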
@@ -2573,7 +2588,7 @@ pub(crate) mod tests { ..Default::default() }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + let engine = open_with_file_system(cfg, fs).unwrap(); let value = vec![b'y'; 8]; let mut log_batch = LogBatch::default(); log_batch.put_unchecked(1, crate::make_internal_key(&[1]), value.clone()); @@ -2649,8 +2664,7 @@ pub(crate) mod tests { }; // Step 1: write data into the main directory. - let engine = - RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2664,7 +2678,7 @@ pub(crate) mod tests { purge_threshold: ReadableSize(40), ..cfg }; - let engine = RaftLogEngine::open_with_file_system(cfg_2, file_system).unwrap(); + let engine = open_with_file_system(cfg_2, file_system).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 1); @@ -2713,8 +2727,7 @@ pub(crate) mod tests { }; // Step 1: write data into the main directory. - let engine = - RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2740,8 +2753,7 @@ pub(crate) mod tests { // abnormal case - Empty second dir { std::fs::remove_dir_all(sec_dir.path()).unwrap(); - let engine = - RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); } // abnormal case - Missing some append files in second dir @@ -2762,7 +2774,7 @@ pub(crate) mod tests { file_count += 1; } } - let engine = RaftLogEngine::open_with_file_system(cfg, file_system).unwrap(); + let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); } // abnormal case - Missing some rewrite files in second dir @@ -2783,7 +2795,7 @@ pub(crate) mod tests { file_count += 1; } } - let engine = RaftLogEngine::open_with_file_system(cfg, file_system).unwrap(); + let engine = open_with_file_system(cfg, file_system).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); } // abnormal case - Missing some reserve files in second dir @@ -2815,8 +2827,7 @@ pub(crate) mod tests { }; { // Step 1: write data into the main directory. 
- let engine = - RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2852,7 +2863,7 @@ pub(crate) mod tests { purge_threshold: ReadableSize(40), ..cfg.clone() }; - let engine = RaftLogEngine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); + let engine = open_with_file_system(cfg_2, file_system.clone()).unwrap(); assert!(number_of_files(spill_dir.path()) > 0); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 1); @@ -2879,7 +2890,7 @@ pub(crate) mod tests { ..cfg }; drop(engine); - let engine = RaftLogEngine::open_with_file_system(cfg_3, file_system).unwrap(); + let engine = open_with_file_system(cfg_3, file_system).unwrap(); assert!(number_of_files(spill_dir.path()) > 0); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 20); diff --git a/src/env/default.rs b/src/env/default.rs index c7bdd6f7..7de4c0de 100644 --- a/src/env/default.rs +++ b/src/env/default.rs @@ -271,6 +271,7 @@ impl WriteExt for LogFile { } } +#[derive(Clone)] pub struct DefaultFileSystem; impl FileSystem for DefaultFileSystem { diff --git a/src/env/double_write.rs b/src/env/double_write.rs index a444a01e..19496aee 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -1,7 +1,6 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. use crate::file_pipe_log::log_file::build_file_reader; -use crate::file_pipe_log::pipe::File; use crate::file_pipe_log::pipe_builder::FileName; use crate::file_pipe_log::reader::LogItemBatchFileReader; use crate::file_pipe_log::FileNameExt; @@ -12,7 +11,9 @@ use crate::{Error, Result}; use crossbeam::channel::unbounded; use crossbeam::channel::Sender; use fail::fail_point; -use log::{info, warn, Log}; +use log::{info, warn}; +use prometheus::core::Atomic; +use std::cell::Cell; use std::fs; use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; use std::path::Path; @@ -20,19 +21,20 @@ use std::path::PathBuf; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; +use std::sync::Mutex; use std::sync::RwLock; use std::thread; use crate::env::default::LogFd; use crate::env::DefaultFileSystem; use crate::env::{FileSystem, Handle, Permission, WriteExt}; -use futures::channel::oneshot; use futures::executor::block_on; use futures::select; +use futures::{channel::oneshot, Future}; use either::Either; -type Callback = Box>) + Send>; +type Callback = Box) + Send>; #[derive(PartialEq)] enum Task { @@ -47,6 +49,39 @@ enum Task { dst_path: PathBuf, }, Stop, +} + +enum TaskRes { + Create(LogFd), + Open(LogFd), + Delete, + Rename, +} + +#[derive(PartialEq, Clone)] +enum HandleTask { + Truncate { + offset: usize, + }, + FileSize, + Sync, + Write { + offset: usize, + bytes: Vec, + }, + Allocate { + offset: usize, + size: usize, + }, + Stop, +} + +enum HandleTaskRes { + Truncate, + FileSize(usize), + Sync, + Write(usize), + Allocate, } #[derive(Default)] @@ -65,13 +100,16 @@ fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { } } -pub struct HedgedFileSystem { - inner: Arc, +pub struct HedgedFileSystem { + base: DefaultFileSystem, path1: PathBuf, path2: PathBuf, - disk1: Sender<(Task, Callback)>, - disk2: Sender<(Task, Callback)>, + disk1: Sender<(Task, Callback)>, + disk2: Sender<(Task, Callback)>, + + counter1: Arc, + counter2: Arc, handle1: Option>, handle2: Option>, @@ -80,38 +118,45 @@ pub struct 
HedgedFileSystem { // TODO: read both dir at recovery, maybe no need? cause operations are to both // disks TODO: consider encryption -impl HedgedFileSystem { - pub fn new(inner: Arc, path1: PathBuf, path2: PathBuf) -> Self { - let (tx1, rx1) = unbounded::<(Task, Callback)>(); - let (tx2, rx2) = unbounded::<(Task, Callback)>(); - let fs1 = inner.clone(); - let handle1 = thread::spawn(|| { +impl HedgedFileSystem { + pub fn new(base: DefaultFileSystem, path1: PathBuf, path2: PathBuf) -> Self { + let (tx1, rx1) = unbounded::<(Task, Callback)>(); + let (tx2, rx2) = unbounded::<(Task, Callback)>(); + let counter1 = Arc::new(AtomicU64::new(0)); + let counter2 = Arc::new(AtomicU64::new(0)); + let counter1_clone = counter1.clone(); + let fs1 = base.clone(); + let handle1 = thread::spawn(move || { for (task, cb) in rx1 { - if task == Task::Stop { + if let Task::Stop = task { break; } fail_point!("double_write::thread1"); - let res = Self::handle(&fs1, task); + let res = Self::process(&fs1, task); cb(res); + counter1_clone.fetch_add(1, Ordering::Relaxed); } }); - let fs2 = inner.clone(); - let handle2 = thread::spawn(|| { - let fs = DefaultFileSystem {}; + let counter2_clone = counter2.clone(); + let fs2 = base.clone(); + let handle2 = thread::spawn(move || { for (task, cb) in rx2 { - if task == Task::Stop { + if let Task::Stop = task { break; } - let res = Self::handle(&fs2, task); + let res = Self::process(&fs2, task); cb(res); + counter2_clone.fetch_add(1, Ordering::Relaxed); } }); Self { - inner, + base, path1, path2, disk1: tx1, disk2: tx2, + counter1, + counter2, handle1: Some(handle1), handle2: Some(handle2), } @@ -127,7 +172,8 @@ impl HedgedFileSystem { match count1.cmp(&count2) { std::cmp::Ordering::Equal => { - // TODO: still need to catch up + // still need to catch up + self.catch_up_diff(files1, files2); return Ok(()); } std::cmp::Ordering::Less => { @@ -200,6 +246,7 @@ impl HedgedFileSystem { // check file size is not enough, treat the last files differently considering // the recycle, always copy the last file + // TODO: only copy diff part if let Some(last_file) = fromFiles.append_file.last() { let to = replace_path( last_file.path.as_ref(), @@ -288,9 +335,8 @@ impl HedgedFileSystem { if let Some(f) = files.append_file.last() { let recovery_read_block_size = 1024; let mut reader = LogItemBatchFileReader::new(recovery_read_block_size); - // TODO: change file system - let handle = Arc::new(DefaultFileSystem {}.open(&f.path, Permission::ReadOnly)?); - let file_reader = build_file_reader(&DefaultFileSystem {}, handle)?; + let handle = Arc::new(self.base.open(&f.path, Permission::ReadOnly)?); + let file_reader = build_file_reader(&self.base, handle)?; match reader.open( FileId { queue: LogQueue::Append, @@ -326,11 +372,18 @@ impl HedgedFileSystem { self.disk1.send((task1, cb1)).unwrap(); self.disk2.send((task2, cb2)).unwrap(); + let resolve = |res: TaskRes| -> LogFd { + match res { + TaskRes::Create(h) => h, + TaskRes::Open(h) => h, + _ => unreachable!(), + } + }; select! 
{ - res1 = f1 => res1.unwrap().map(|h| HedgedHandle::new( - Either::Right(h.unwrap()), Either::Left(f2) )), - res2 = f2 => res2.unwrap().map(|h| HedgedHandle::new( - Either::Left(f1), Either::Right(h.unwrap()) )), + res1 = f1 => res1.unwrap().map(|res| HedgedHandle::new( + FutureHandle::new_owned(resolve(res)), FutureHandle::new(f2) , self.counter1.clone(), self.counter2.clone())), + res2 = f2 => res2.unwrap().map(|res| HedgedHandle::new( + FutureHandle::new(f1), FutureHandle::new_owned(resolve(res)) , self.counter1.clone(), self.counter2.clone())), } } @@ -347,20 +400,20 @@ impl HedgedFileSystem { } #[inline] - fn handle(file_system: &F, task: Task) -> IoResult> { + fn process(file_system: &DefaultFileSystem, task: Task) -> IoResult { match task { - Task::Create(path) => file_system.create(path).map(Some), - Task::Open { path, perm } => file_system.open(path, perm).map(Some), - Task::Delete(path) => file_system.delete(path).map(|_| None), - Task::Rename { src_path, dst_path } => { - file_system.rename(src_path, dst_path).map(|_| None) - } + Task::Create(path) => file_system.create(path).map(|h| TaskRes::Create(h)), + Task::Open { path, perm } => file_system.open(path, perm).map(|h| TaskRes::Open(h)), + Task::Delete(path) => file_system.delete(path).map(|_| TaskRes::Delete), + Task::Rename { src_path, dst_path } => file_system + .rename(src_path, dst_path) + .map(|_| TaskRes::Rename), Task::Stop => unreachable!(), } } } -impl Drop for HedgedFileSystem { +impl Drop for HedgedFileSystem { fn drop(&mut self) { self.disk1.send((Task::Stop, Box::new(|_| {}))).unwrap(); self.disk2.send((Task::Stop, Box::new(|_| {}))).unwrap(); @@ -369,7 +422,7 @@ impl Drop for HedgedFileSystem { } } -impl FileSystem for HedgedFileSystem { +impl FileSystem for HedgedFileSystem { type Handle = HedgedHandle; type Reader = HedgedReader; type Writer = HedgedWriter; @@ -431,48 +484,89 @@ impl FileSystem for HedgedFileSystem { } } -#[derive(Clone, PartialEq)] -enum FileTask { - Truncate(usize), - FileSize, - Sync, - Write { offset: usize, bytes: Vec }, - Allocate { offset: usize, size: usize }, - Stop, +pub struct FutureHandle { + inner: Either>, Arc>, +} + +impl FutureHandle { + fn new(rx: oneshot::Receiver>) -> Self { + Self { + inner: Either::Left(rx), + } + } + fn new_owned(h: LogFd) -> Self { + Self { + inner: Either::Right(Arc::new(h)), + } + } + + fn get(self) -> Arc { + let fd = match self.inner { + Either::Left(rx) => { + // TODO: should we handle the second disk io error + match block_on(rx).unwrap().unwrap() { + TaskRes::Open(fd) => Arc::new(fd), + TaskRes::Create(fd) => Arc::new(fd), + _ => unreachable!(), + } + } + Either::Right(w) => w, + }; + fd + } + + // fn try_get(&self) -> Option> { + // let mut set = false; + // let fd = match self.inner { + // Either::Left(rx) => { + // set = true; + // // TODO: should we handle the second disk io error + // match rx.try_recv().unwrap() { + // None => return None, + // Some(Err(_)) => panic!(), + // Some(Ok(TaskRes::Open(fd))) => Arc::new(fd), + // Some(Ok(TaskRes::Create(fd))) => Arc::new(fd), + // _ => unreachable!(), + // } + // } + // Either::Right(w) => w.clone(), + // }; + // if set { + // self.inner = Either::Right(fd.clone()); + // } + // Some(fd) + // } } pub struct HedgedHandle { - disk1: Sender<(FileTask, Callback)>, - disk2: Sender<(FileTask, Callback)>, + disk1: Sender<(HandleTask, Callback)>, + disk2: Sender<(HandleTask, Callback)>, counter1: Arc, counter2: Arc, fd1: Arc>>>, fd2: Arc>>>, - handle1: Option>, - handle2: Option>, + t1: Option>, + t2: 
Option>, } impl HedgedHandle { - pub fn new( - file1: Either>>, LogFd>, - file2: Either>>, LogFd>, - ) -> Self { - let (tx1, rx1) = unbounded::<(FileTask, Callback)>(); - let (tx2, rx2) = unbounded::<(FileTask, Callback)>(); + pub fn new(handle1: FutureHandle, handle2: FutureHandle, counter1: Arc, counter2: Arc ) -> Self { + let (tx1, rx1) = unbounded::<(HandleTask, Callback)>(); + let (tx2, rx2) = unbounded::<(HandleTask, Callback)>(); let counter1 = Arc::new(AtomicU64::new(0)); let counter2 = Arc::new(AtomicU64::new(0)); let fd1 = Arc::new(RwLock::new(None)); let fd2 = Arc::new(RwLock::new(None)); - let handle1 = { + let t1 = { let fd1 = fd1.clone(); let counter1 = counter1.clone(); thread::spawn(move || { - let fd = Arc::new(Self::resolve(file1)); + let fd = handle1.get(); fd1.write().unwrap().replace(fd.clone()); for (task, cb) in rx1 { - if task == FileTask::Stop { + if task == HandleTask::Stop { break; } let res = Self::handle(&fd, task); @@ -481,14 +575,14 @@ impl HedgedHandle { } }) }; - let handle2 = { + let t2 = { let fd2 = fd2.clone(); let counter2 = counter2.clone(); thread::spawn(move || { - let fd = Arc::new(Self::resolve(file2)); + let fd = handle2.get(); fd2.write().unwrap().replace(fd.clone()); for (task, cb) in rx2 { - if task == FileTask::Stop { + if task == HandleTask::Stop { break; } let res = Self::handle(&fd, task); @@ -504,29 +598,19 @@ impl HedgedHandle { counter2, fd1, fd2, - handle1: Some(handle1), - handle2: Some(handle2), - } - } - - fn resolve(file: Either>>, LogFd>) -> LogFd { - match file { - Either::Left(f) => { - // TODO: should we handle the second disk io error - block_on(f).unwrap().unwrap().unwrap() - } - Either::Right(fd) => fd, + t1: Some(t1), + t2: Some(t2), } } - fn handle(fd: &LogFd, task: FileTask) -> IoResult> { + fn handle(fd: &LogFd, task: HandleTask) -> IoResult { match task { - FileTask::Truncate(offset) => fd.truncate(offset).map(|_| None), - FileTask::FileSize => fd.file_size().map(Some), - FileTask::Sync => fd.sync().map(|_| None), - FileTask::Write { offset, bytes } => fd.write(offset, &bytes).map(Some), - FileTask::Allocate { offset, size } => fd.allocate(offset, size).map(|_| None), - FileTask::Stop => unreachable!(), + HandleTask::Truncate{offset} => fd.truncate(offset).map(|_| HandleTaskRes::Truncate), + HandleTask::FileSize => fd.file_size().map(|s| HandleTaskRes::FileSize(s)), + HandleTask::Sync => fd.sync().map(|_| HandleTaskRes::Sync), + HandleTask::Write { offset, bytes } => fd.write(offset, &bytes).map(|s| HandleTaskRes::Write(s)), + HandleTask::Allocate { offset, size } => fd.allocate(offset, size).map(|_| HandleTaskRes::Allocate), + HandleTask::Stop => unreachable!(), } } @@ -555,18 +639,19 @@ impl HedgedHandle { } fn write(&self, offset: usize, content: &[u8]) -> IoResult { - block_on(self.wait_one(FileTask::Write { + block_on(self.wait_one(HandleTask::Write { offset, bytes: content.to_vec(), })) - .map(|x| x.unwrap()) + .map(|res| if let HandleTaskRes::Write(size) = res { size } else { unreachable!() }) } fn allocate(&self, offset: usize, size: usize) -> IoResult<()> { - block_on(self.wait_one(FileTask::Allocate { offset, size })).map(|_| ()) + block_on(self.wait_one(HandleTask::Allocate { offset, size })) + .map(|_| ()) } - async fn wait_one(&self, task: FileTask) -> IoResult> { + async fn wait_one(&self, task: HandleTask) -> IoResult { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); self.disk1.send((task.clone(), cb1)).unwrap(); @@ -581,24 +666,24 @@ impl HedgedHandle { impl 
Drop for HedgedHandle { fn drop(&mut self) { - self.disk1.send((FileTask::Stop, Box::new(|_| {}))).unwrap(); - self.disk2.send((FileTask::Stop, Box::new(|_| {}))).unwrap(); - self.handle1.take().unwrap().join().unwrap(); - self.handle2.take().unwrap().join().unwrap(); + self.disk1.send((HandleTask::Stop, Box::new(|_| {}))).unwrap(); + self.disk2.send((HandleTask::Stop, Box::new(|_| {}))).unwrap(); + self.t1.take().unwrap().join().unwrap(); + self.t2.take().unwrap().join().unwrap(); } } impl Handle for HedgedHandle { fn truncate(&self, offset: usize) -> IoResult<()> { - block_on(self.wait_one(FileTask::Truncate(offset))).map(|_| ()) + block_on(self.wait_one(HandleTask::Truncate{offset})).map(|_| ()) } fn file_size(&self) -> IoResult { - block_on(self.wait_one(FileTask::FileSize)).map(|x| x.unwrap()) + block_on(self.wait_one(HandleTask::FileSize)).map(|res| if let HandleTaskRes::FileSize(size) = res { size } else { unreachable!() }) } fn sync(&self) -> IoResult<()> { - block_on(self.wait_one(FileTask::Sync)).map(|_| ()) + block_on(self.wait_one(HandleTask::Sync)).map(|_| ()) } } @@ -684,8 +769,8 @@ impl Read for HedgedReader { } } -pub fn paired_future_callback( -) -> (Callback, oneshot::Receiver>>) { +pub fn paired_future_callback() -> (Callback, oneshot::Receiver>) +{ let (tx, future) = oneshot::channel(); let callback = Box::new(move |result| { let r = tx.send(result); diff --git a/src/env/mod.rs b/src/env/mod.rs index a187c534..a2b40571 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -19,7 +19,7 @@ pub enum Permission { } /// FileSystem -pub trait FileSystem: Send + Sync { +pub trait FileSystem: Send + Sync + 'static { type Handle: Send + Sync + Handle; type Reader: Seek + Read + Send; type Writer: Seek + Write + Send + WriteExt; From 394beedea1e94df3ca51d9cbece27bc36b5af804 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 24 Aug 2023 15:58:15 +0800 Subject: [PATCH 07/32] add recover ext Signed-off-by: Connor1996 --- src/engine.rs | 1 + src/env/double_write.rs | 61 ++++++++++++++++++++++++--------------- src/env/mod.rs | 20 ++++++++++++- src/file_pipe_log/pipe.rs | 5 ++++ src/memtable.rs | 2 +- src/purge.rs | 4 +++ 6 files changed, 67 insertions(+), 26 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 04d98dcb..a3c0d583 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -125,6 +125,7 @@ where mut listeners: Vec>, ) -> Result>> { cfg.sanitize()?; + file_system.bootstrap()?; listeners.push(Arc::new(PurgeHook::default()) as Arc); let start = Instant::now(); diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 19496aee..125b7195 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -162,30 +162,6 @@ impl HedgedFileSystem { } } - pub fn bootstrap(&self) -> Result<()> { - // catch up diff - let files1 = self.get_files(&self.path1)?; - let files2 = self.get_files(&self.path2)?; - - let count1 = self.get_latest_valid_seq(&files1)?; - let count2 = self.get_latest_valid_seq(&files2)?; - - match count1.cmp(&count2) { - std::cmp::Ordering::Equal => { - // still need to catch up - self.catch_up_diff(files1, files2); - return Ok(()); - } - std::cmp::Ordering::Less => { - self.catch_up_diff(files2, files1)?; - } - std::cmp::Ordering::Greater => { - self.catch_up_diff(files1, files2)?; - } - } - Ok(()) - } - fn catch_up_diff(&self, fromFiles: Files, toFiles: Files) -> Result<()> { let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { let mut iter1 = from.iter().peekable(); @@ -422,6 +398,43 @@ impl Drop for HedgedFileSystem { } } +impl RecoverExt 
for HedgedFileSystem {
+    fn bootstrap(&self) -> Result<()> {
+        // catch up diff
+        let files1 = self.get_files(&self.path1)?;
+        let files2 = self.get_files(&self.path2)?;
+
+        let count1 = self.get_latest_valid_seq(&files1)?;
+        let count2 = self.get_latest_valid_seq(&files2)?;
+
+        match count1.cmp(&count2) {
+            std::cmp::Ordering::Equal => {
+                // still need to catch up, but only diff
+                self.catch_up_diff(files1, files2)?;
+                return Ok(());
+            }
+            std::cmp::Ordering::Less => {
+                self.catch_up_diff(files2, files1)?;
+            }
+            std::cmp::Ordering::Greater => {
+                self.catch_up_diff(files1, files2)?;
+            }
+        }
+        Ok(())
+    }
+
+    fn need_recover(&self) -> bool {
+        // TODO: detect whether the slower disk still has to catch up after bootstrap.
+        false
+    }
+
+    fn is_in_recover(&self) -> bool {
+        false
+    }
+
+    fn trigger_recover(&self) {
+        ()
+    }
+}
+
 impl FileSystem for HedgedFileSystem {
     type Handle = HedgedHandle;
     type Reader = HedgedReader;
diff --git a/src/env/mod.rs b/src/env/mod.rs
index a2b40571..25e1707b 100644
--- a/src/env/mod.rs
+++ b/src/env/mod.rs
@@ -19,7 +19,7 @@ pub enum Permission {
 }
 
 /// FileSystem
-pub trait FileSystem: Send + Sync + 'static {
+pub trait FileSystem: Send + Sync + 'static + RecoverExt {
     type Handle: Send + Sync + Handle;
     type Reader: Seek + Read + Send;
     type Writer: Seek + Write + Send + WriteExt;
@@ -65,6 +65,24 @@ pub trait FileSystem: Send + Sync + 'static {
     fn new_writer(&self, handle: Arc) -> Result;
 }
 
+pub trait RecoverExt {
+    fn bootstrap(&self) -> Result<()> {
+        Ok(())
+    }
+
+    fn need_recover(&self) -> bool {
+        false
+    }
+
+    fn is_in_recover(&self) -> bool {
+        false
+    }
+
+    fn trigger_recover(&self) {
+        ()
+    }
+}
+
 pub trait Handle {
     fn truncate(&self, offset: usize) -> Result<()>;
 
diff --git a/src/file_pipe_log/pipe.rs b/src/file_pipe_log/pipe.rs
index 5a5916ea..bbec7bc9 100644
--- a/src/file_pipe_log/pipe.rs
+++ b/src/file_pipe_log/pipe.rs
@@ -513,6 +513,11 @@ impl PipeLog for DualPipes {
         queue: LogQueue,
         bytes: &mut T,
     ) -> Result {
+        if self.file_system.need_recover() {
+            self.pipes[LogQueue::Append as usize].rotate();
+            self.pipes[LogQueue::Rewrite as usize].rotate();
+            self.file_system.trigger_recover();
+        }
         self.pipes[queue as usize].append(bytes)
     }
 
diff --git a/src/memtable.rs b/src/memtable.rs
index d46ba68b..a667ce24 100644
--- a/src/memtable.rs
+++ b/src/memtable.rs
@@ -1060,7 +1060,7 @@ impl MemTableAccessor {
             raft == 3,
             |_| {}
         );
-        match item.content {
+        match item.content
             LogItemContent::EntryIndexes(entries_to_add) => {
                 memtable.write().append(entries_to_add.0);
             }
diff --git a/src/purge.rs b/src/purge.rs
index cb76f776..a4c220cd 100644
--- a/src/purge.rs
+++ b/src/purge.rs
@@ -75,6 +75,10 @@ where
     pub fn purge_expired_files(&self) -> Result> {
         let _t = StopWatch::new(&*ENGINE_PURGE_DURATION_HISTOGRAM);
 
+        if self.file_system().is_in_recover() {
+            info!("skip purge due to in recover");
+            return Ok(vec![]);
+        }
         let guard = self.force_rewrite_candidates.try_lock();
         if guard.is_none() {
             warn!("Unable to purge expired files: locked");

From adbd517a5e9c2eeab4488b5264f15243fff0b900 Mon Sep 17 00:00:00 2001
From: Connor1996
Date: Thu, 31 Aug 2023 14:09:50 +0800
Subject: [PATCH 08/32] refactor

Signed-off-by: Connor1996
---
 src/engine.rs                 | 126 +++++++------
 src/env/default.rs            |   3 +
 src/env/double_write.rs       | 326 ++++++++++++++++++----------------
 src/env/mod.rs                |   4 +-
 src/env/obfuscated.rs         |   3 +
 src/file_pipe_log/log_file.rs |   6 +-
 src/file_pipe_log/mod.rs      |   5 +-
 src/file_pipe_log/pipe.rs     |  10 +-
 src/memtable.rs               |   2 +-
 src/purge.rs                  |   8 +-
 10 files changed, 268 insertions(+), 225 deletions(-)

diff --git a/src/engine.rs b/src/engine.rs
index a3c0d583..d964ba64 100644
---
a/src/engine.rs +++ b/src/engine.rs @@ -11,7 +11,7 @@ use log::{error, info}; use protobuf::{parse_from_bytes, Message}; use crate::consistency::ConsistencyChecker; -use crate::env::{DefaultFileSystem, FileSystem}; +use crate::env::{DefaultFileSystem, FileSystem, RecoverExt}; use crate::event_listener::EventListener; use crate::file_pipe_log::debug::LogItemReader; use crate::file_pipe_log::{DefaultMachineFactory, FilePipeLog, FilePipeLogBuilder}; @@ -90,9 +90,9 @@ impl Engine> { } } -pub fn open_with_hedged_file_system( +pub fn open_with_hedged_file_system( cfg: Config, - file_system: Arc, + file_system: Arc, ) -> Result>> { let file_system = if let Some(ref sec_dir) = cfg.second_dir { let fs = Arc::new(HedgedFileSystem::new( @@ -108,17 +108,17 @@ pub fn open_with_hedged_file_system( Engine::open_with(cfg, file_system, vec![]) } -pub fn open_with_file_system( - cfg: Config, - file_system: Arc, -) -> Result>> { - Engine::open_with(cfg, file_system, vec![]) -} - impl Engine> where F: FileSystem, { + pub fn open_with_file_system( + cfg: Config, + file_system: Arc, + ) -> Result>> { + Engine::open_with(cfg, file_system, vec![]) + } + fn open_with( mut cfg: Config, file_system: Arc, @@ -784,7 +784,7 @@ pub(crate) mod tests { dir: sub_dir.to_str().unwrap().to_owned(), ..Default::default() }; - open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); } #[test] @@ -802,9 +802,11 @@ pub(crate) mod tests { ..Default::default() }; - let engine = - open_with_file_system(cfg.clone(), Arc::new(ObfuscatedFileSystem::default())) - .unwrap(); + let engine = Engine::open_with_file_system( + cfg.clone(), + Arc::new(ObfuscatedFileSystem::default()), + ) + .unwrap(); assert_eq!(engine.path(), dir.path().to_str().unwrap()); let data = vec![b'x'; entry_size]; for i in 10..20 { @@ -854,7 +856,7 @@ pub(crate) mod tests { target_file_size: ReadableSize(1), ..Default::default() }; - let engine = open_with_file_system( + let engine = Engine::open_with_file_system( cfg.clone(), Arc::new(ObfuscatedFileSystem::default()), ) @@ -927,7 +929,8 @@ pub(crate) mod tests { ..Default::default() }; let rid = 1; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); engine .scan_messages::(rid, None, None, false, |_, _| { @@ -1014,7 +1017,8 @@ pub(crate) mod tests { let mut delete_batch = LogBatch::default(); delete_batch.delete(rid, key.clone()); - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); assert_eq!( engine.get_message::(rid, &key).unwrap(), None @@ -1123,7 +1127,8 @@ pub(crate) mod tests { target_file_size: ReadableSize(1), ..Default::default() }; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; // rewrite:[1 ..10] @@ -1234,7 +1239,8 @@ pub(crate) mod tests { ..Default::default() }; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; for index in 0..100 { 
engine.append(1, index, index + 1, Some(&data)); @@ -1293,7 +1299,8 @@ pub(crate) mod tests { ..Default::default() }; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; // write 50 small entries into region 1~3, it should trigger force compact. for rid in 1..=3 { @@ -1346,7 +1353,8 @@ pub(crate) mod tests { purge_threshold: ReadableSize::kb(80), ..Default::default() }; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; // Put 100 entries into 10 regions. @@ -1408,7 +1416,8 @@ pub(crate) mod tests { dir: dir.path().to_str().unwrap().to_owned(), ..Default::default() }; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let mut log_batch = LogBatch::default(); let empty_entry = Entry::new(); @@ -1466,7 +1475,8 @@ pub(crate) mod tests { dir: dir.path().to_str().unwrap().to_owned(), ..Default::default() }; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 16]; let cases = [[false, false], [false, true], [true, true]]; for (i, writes) in cases.iter().enumerate() { @@ -1492,7 +1502,8 @@ pub(crate) mod tests { dir: dir.path().to_str().unwrap().to_owned(), ..Default::default() }; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 1024]; for rid in 1..21 { @@ -1523,7 +1534,8 @@ pub(crate) mod tests { target_file_size: ReadableSize(1), ..Default::default() }; - let engine = open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + let engine = + Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); let data = vec![b'x'; 2 * 1024 * 1024]; for rid in 1..=3 { @@ -1681,7 +1693,7 @@ pub(crate) mod tests { ..Default::default() }; - let engine = open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs.clone()).unwrap(); for bs in batches.iter_mut() { for batch in bs.iter_mut() { engine.write(batch, false).unwrap(); @@ -1742,7 +1754,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1779,7 +1791,7 @@ pub(crate) mod tests { ) .unwrap(); - let engine = open_with_file_system(cfg, fs).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); for rid in 1..25 { engine.scan_entries(rid, 1, 6, |_, _, d| { assert_eq!(d, &entry_data); @@ -1807,7 +1819,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { 
engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1841,7 +1853,7 @@ pub(crate) mod tests { ) .unwrap(); - let engine = open_with_file_system(cfg, fs).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); for rid in 1..25 { if existing_emptied.contains(&rid) || incoming_emptied.contains(&rid) { continue; @@ -1888,7 +1900,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1909,11 +1921,11 @@ pub(crate) mod tests { // Corrupt a log batch. f.set_len(f.metadata().unwrap().len() - 1).unwrap(); - open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); // Corrupt the file header. f.set_len(1).unwrap(); - open_with_file_system(cfg, fs).unwrap(); + Engine::open_with_file_system(cfg, fs).unwrap(); } #[test] @@ -1930,7 +1942,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } @@ -1938,7 +1950,7 @@ pub(crate) mod tests { assert!(RaftLogEngine::open(cfg.clone()).is_err()); - let engine = open_with_file_system(cfg, fs).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); for rid in 1..10 { engine.scan_entries(rid, 1, 11, |_, _, d| { assert_eq!(d, &entry_data); @@ -1990,7 +2002,7 @@ pub(crate) mod tests { let fs = Arc::new(ObfuscatedFileSystem::default()); let rid = 1; - let engine = open_with_file_system(cfg, fs).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); assert!(engine.is_empty()); engine.append(rid, 1, 11, Some(&entry_data)); assert!(!engine.is_empty()); @@ -2127,7 +2139,7 @@ pub(crate) mod tests { ..Default::default() }; let fs = Arc::new(DeleteMonitoredFileSystem::new()); - let engine = open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } @@ -2184,7 +2196,7 @@ pub(crate) mod tests { ..Default::default() }; let fs = Arc::new(DeleteMonitoredFileSystem::new()); - let engine = open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs.clone()).unwrap(); let reserved_start = *fs.reserved_metadata.lock().unwrap().first().unwrap(); for rid in 1..=10 { @@ -2292,14 +2304,14 @@ pub(crate) mod tests { assert!(cfg_v2.recycle_capacity() > 0); // Prepare files with format_version V1 { - let engine = open_with_file_system(cfg_v1.clone(), fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg_v1.clone(), fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } } // Reopen the Engine with V2 and purge { - let engine = open_with_file_system(cfg_v2.clone(), fs.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg_v2.clone(), fs.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); for rid in 6..=10 { engine.append(rid, 11, 20, Some(&entry_data)); @@ -2313,7 +2325,7 @@ pub(crate) mod tests { } // Reopen the Engine with V1 -> V2 and purge { - let engine = open_with_file_system(cfg_v1, fs.clone()).unwrap(); + let engine = 
Engine::open_with_file_system(cfg_v1, fs.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); for rid in 6..=10 { engine.append(rid, 20, 30, Some(&entry_data)); @@ -2327,7 +2339,7 @@ pub(crate) mod tests { assert_eq!(engine.file_span(LogQueue::Append).0, start); let file_count = engine.file_count(Some(LogQueue::Append)); drop(engine); - let engine = open_with_file_system(cfg_v2, fs).unwrap(); + let engine = Engine::open_with_file_system(cfg_v2, fs).unwrap(); assert_eq!(engine.file_span(LogQueue::Append).0, start); assert_eq!(engine.file_count(Some(LogQueue::Append)), file_count); // Mark all regions obsolete. @@ -2358,7 +2370,7 @@ pub(crate) mod tests { enable_log_recycle: false, ..Default::default() }; - let engine = open_with_file_system(cfg, file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg, file_system.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); // Only one valid file left, the last one => active_file. assert_eq!(engine.file_count(Some(LogQueue::Append)), 1); @@ -2380,7 +2392,7 @@ pub(crate) mod tests { prefill_for_recycle: true, ..Default::default() }; - let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); let (start, end) = engine.file_span(LogQueue::Append); // Only one valid file left, the last one => active_file. assert_eq!(start, end); @@ -2403,7 +2415,7 @@ pub(crate) mod tests { purge_threshold: ReadableSize(50), ..cfg }; - let engine = open_with_file_system(cfg_v2.clone(), file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg_v2.clone(), file_system.clone()).unwrap(); assert_eq!(engine.file_span(LogQueue::Append), (start, end)); assert!(recycled_count > file_system.inner.file_count() - engine.file_count(None)); // Recycled files have filled the LogQueue::Append, purge_expired_files won't @@ -2427,7 +2439,7 @@ pub(crate) mod tests { prefill_for_recycle: false, ..cfg_v2 }; - let engine = open_with_file_system(cfg_v3, file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg_v3, file_system.clone()).unwrap(); assert_eq!(file_system.inner.file_count(), engine.file_count(None)); } @@ -2448,7 +2460,7 @@ pub(crate) mod tests { let key = vec![b'x'; 2]; let value = vec![b'y'; 8]; - let engine = open_with_file_system(cfg, fs).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); let mut data = HashSet::new(); let mut rid = 1; // Directly write to pipe log. @@ -2589,7 +2601,7 @@ pub(crate) mod tests { ..Default::default() }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = open_with_file_system(cfg, fs).unwrap(); + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); let value = vec![b'y'; 8]; let mut log_batch = LogBatch::default(); log_batch.put_unchecked(1, crate::make_internal_key(&[1]), value.clone()); @@ -2665,7 +2677,7 @@ pub(crate) mod tests { }; // Step 1: write data into the main directory. 
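// Illustrative sketch, not part of this patch: the convergence property the
// second-disk tests below assert. After bootstrap (or after a paused disk thread is
// resumed), both directories should end up with the same set of log files.
// `same_log_files` is a hypothetical helper, not an API of this crate; it mirrors
// the `number_of_files` counter used by these tests.
fn same_log_files(dir1: &std::path::Path, dir2: &std::path::Path) -> bool {
    use std::collections::BTreeSet;
    let list = |p: &std::path::Path| -> BTreeSet<String> {
        std::fs::read_dir(p)
            .unwrap()
            .map(|e| e.unwrap().file_name().to_string_lossy().into_owned())
            // log and rewrite file names start with a zero-padded sequence number
            .filter(|name| name.starts_with("000"))
            .collect()
    };
    list(dir1) == list(dir2)
}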
- let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2679,7 +2691,7 @@ pub(crate) mod tests { purge_threshold: ReadableSize(40), ..cfg }; - let engine = open_with_file_system(cfg_2, file_system).unwrap(); + let engine = Engine::open_with_file_system(cfg_2, file_system).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 1); @@ -2728,7 +2740,7 @@ pub(crate) mod tests { }; // Step 1: write data into the main directory. - let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2754,7 +2766,7 @@ pub(crate) mod tests { // abnormal case - Empty second dir { std::fs::remove_dir_all(sec_dir.path()).unwrap(); - let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); } // abnormal case - Missing some append files in second dir @@ -2775,7 +2787,7 @@ pub(crate) mod tests { file_count += 1; } } - let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); } // abnormal case - Missing some rewrite files in second dir @@ -2796,7 +2808,7 @@ pub(crate) mod tests { file_count += 1; } } - let engine = open_with_file_system(cfg, file_system).unwrap(); + let engine = Engine::open_with_file_system(cfg, file_system).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); } // abnormal case - Missing some reserve files in second dir @@ -2828,7 +2840,7 @@ pub(crate) mod tests { }; { // Step 1: write data into the main directory. 
- let engine = open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2864,7 +2876,7 @@ pub(crate) mod tests { purge_threshold: ReadableSize(40), ..cfg.clone() }; - let engine = open_with_file_system(cfg_2, file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); assert!(number_of_files(spill_dir.path()) > 0); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 1); @@ -2891,7 +2903,7 @@ pub(crate) mod tests { ..cfg }; drop(engine); - let engine = open_with_file_system(cfg_3, file_system).unwrap(); + let engine = Engine::open_with_file_system(cfg_3, file_system).unwrap(); assert!(number_of_files(spill_dir.path()) > 0); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 20); diff --git a/src/env/default.rs b/src/env/default.rs index 7de4c0de..9b5084db 100644 --- a/src/env/default.rs +++ b/src/env/default.rs @@ -5,6 +5,7 @@ use std::os::unix::io::RawFd; use std::path::Path; use std::sync::Arc; +use crate::env::RecoverExt; use fail::fail_point; use log::error; use nix::errno::Errno; @@ -274,6 +275,8 @@ impl WriteExt for LogFile { #[derive(Clone)] pub struct DefaultFileSystem; +impl RecoverExt for DefaultFileSystem {} + impl FileSystem for DefaultFileSystem { type Handle = LogFd; type Reader = LogFile; diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 125b7195..5e1dc29f 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -1,5 +1,6 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. +use crate::env::RecoverExt; use crate::file_pipe_log::log_file::build_file_reader; use crate::file_pipe_log::pipe_builder::FileName; use crate::file_pipe_log::reader::LogItemBatchFileReader; @@ -12,31 +13,27 @@ use crossbeam::channel::unbounded; use crossbeam::channel::Sender; use fail::fail_point; use log::{info, warn}; -use prometheus::core::Atomic; -use std::cell::Cell; +use std::cell::UnsafeCell; use std::fs; -use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; +use std::io::{Error as IoError, Read, Result as IoResult, Seek, SeekFrom, Write}; use std::path::Path; use std::path::PathBuf; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::sync::Mutex; -use std::sync::RwLock; use std::thread; use crate::env::default::LogFd; use crate::env::DefaultFileSystem; use crate::env::{FileSystem, Handle, Permission, WriteExt}; +use futures::channel::oneshot; use futures::executor::block_on; use futures::select; -use futures::{channel::oneshot, Future}; use either::Either; type Callback = Box) + Send>; -#[derive(PartialEq)] enum Task { Create(PathBuf), Open { @@ -48,35 +45,30 @@ enum Task { src_path: PathBuf, dst_path: PathBuf, }, - Stop, -} - -enum TaskRes { - Create(LogFd), - Open(LogFd), - Delete, - Rename, -} - -#[derive(PartialEq, Clone)] -enum HandleTask { Truncate { + handle: Arc, offset: usize, }, - FileSize, - Sync, + FileSize(Arc), + Sync(Arc), Write { + handle: Arc, offset: usize, bytes: Vec, }, Allocate { + handle: Arc, offset: usize, size: usize, }, Stop, } -enum HandleTaskRes { +enum TaskRes { + Create(LogFd), + Open(LogFd), + Delete, + Rename, Truncate, FileSize(usize), Sync, @@ -101,7 +93,7 @@ fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { } pub struct HedgedFileSystem { - base: DefaultFileSystem, + base: Arc, path1: 
PathBuf, path2: PathBuf, @@ -119,7 +111,7 @@ pub struct HedgedFileSystem { // disks TODO: consider encryption impl HedgedFileSystem { - pub fn new(base: DefaultFileSystem, path1: PathBuf, path2: PathBuf) -> Self { + pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { let (tx1, rx1) = unbounded::<(Task, Callback)>(); let (tx2, rx2) = unbounded::<(Task, Callback)>(); let counter1 = Arc::new(AtomicU64::new(0)); @@ -162,7 +154,7 @@ impl HedgedFileSystem { } } - fn catch_up_diff(&self, fromFiles: Files, toFiles: Files) -> Result<()> { + fn catch_up_diff(&self, fromFiles: Files, toFiles: Files) -> IoResult<()> { let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { let mut iter1 = from.iter().peekable(); let mut iter2 = to.iter().peekable(); @@ -306,13 +298,13 @@ impl HedgedFileSystem { Ok(files) } - fn get_latest_valid_seq(&self, files: &Files) -> Result { + fn get_latest_valid_seq(&self, files: &Files) -> IoResult { let mut count = 0; if let Some(f) = files.append_file.last() { let recovery_read_block_size = 1024; let mut reader = LogItemBatchFileReader::new(recovery_read_block_size); let handle = Arc::new(self.base.open(&f.path, Permission::ReadOnly)?); - let file_reader = build_file_reader(&self.base, handle)?; + let file_reader = build_file_reader(self.base.as_ref(), handle)?; match reader.open( FileId { queue: LogQueue::Append, @@ -320,10 +312,10 @@ impl HedgedFileSystem { }, file_reader, ) { - Err(e) if matches!(e, Error::Io(_)) => return Err(e), - Err(e) => { - return Ok(0); - } + Err(e) => match e { + Error::Io(err) => return Err(err), + _ => return Ok(0), + }, Ok(format) => { // Do nothing } @@ -384,6 +376,27 @@ impl HedgedFileSystem { Task::Rename { src_path, dst_path } => file_system .rename(src_path, dst_path) .map(|_| TaskRes::Rename), + Task::Truncate { handle, offset } => { + handle.get().truncate(offset).map(|_| TaskRes::Truncate) + } + Task::FileSize(handle) => handle.get().file_size().map(|s| TaskRes::FileSize(s)), + Task::Sync(handle) => handle.get().sync().map(|_| TaskRes::Sync), + Task::Write { + handle, + offset, + bytes, + } => handle + .get() + .write(offset, &bytes) + .map(|s| TaskRes::Write(s)), + Task::Allocate { + handle, + offset, + size, + } => handle + .get() + .allocate(offset, size) + .map(|_| TaskRes::Allocate), Task::Stop => unreachable!(), } } @@ -399,7 +412,7 @@ impl Drop for HedgedFileSystem { } impl RecoverExt for HedgedFileSystem { - fn bootstrap(&self) -> Result<()> { + fn bootstrap(&self) -> IoResult<()> { // catch up diff let files1 = self.get_files(&self.path1)?; let files2 = self.get_files(&self.path2)?; @@ -424,13 +437,14 @@ impl RecoverExt for HedgedFileSystem { } fn need_recover(&self) -> bool { + false } fn is_in_recover(&self) -> bool { false } - fn trigger_recover(&self) { + fn trigger_recover(&self) { () } } @@ -498,24 +512,36 @@ impl FileSystem for HedgedFileSystem { } pub struct FutureHandle { - inner: Either>, Arc>, + inner: UnsafeCell>, Arc>>, } +unsafe impl Send for FutureHandle {} + +// To avoid using `Mutex` +// Safety: +// For write, all writes are serialized to one channel, so only one thread will +// update the inner. For read, multiple readers and one writer and may visit +// try_get() concurrently to get the fd from receiver. The receiver is `Sync`, +// so only one of them will get the fd, and update the inner to Arc. 
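FutureHandle above holds the receiving half of the callback/future pair used throughout this module. The `paired_future_callback` helper itself is not shown in this patch; a plausible minimal definition, assuming the usual futures oneshot pattern (the real helper may differ), is:

    use futures::channel::oneshot;

    // Returns a boxed callback plus the future (receiver) that resolves once
    // the callback has been invoked with a result.
    fn paired_future_callback<T: Send + 'static>(
    ) -> (Box<dyn FnOnce(T) + Send>, oneshot::Receiver<T>) {
        let (tx, rx) = oneshot::channel();
        let cb: Box<dyn FnOnce(T) + Send> = Box::new(move |res: T| {
            // Sending fails only if the receiver side was dropped; ignore it.
            let _ = tx.send(res);
        });
        (cb, rx)
    }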
+unsafe impl Sync for FutureHandle {} + impl FutureHandle { fn new(rx: oneshot::Receiver>) -> Self { Self { - inner: Either::Left(rx), + inner: UnsafeCell::new(Either::Left(rx)), } } fn new_owned(h: LogFd) -> Self { Self { - inner: Either::Right(Arc::new(h)), + inner: UnsafeCell::new(Either::Right(Arc::new(h))), } } - fn get(self) -> Arc { - let fd = match self.inner { + fn get(&self) -> Arc { + let mut set = false; + let fd = match unsafe { &mut *self.inner.get() } { Either::Left(rx) => { + set = true; // TODO: should we handle the second disk io error match block_on(rx).unwrap().unwrap() { TaskRes::Open(fd) => Arc::new(fd), @@ -523,107 +549,69 @@ impl FutureHandle { _ => unreachable!(), } } - Either::Right(w) => w, + Either::Right(w) => w.clone(), }; + if set { + unsafe { + *self.inner.get() = Either::Right(fd.clone()); + } + } fd } - // fn try_get(&self) -> Option> { - // let mut set = false; - // let fd = match self.inner { - // Either::Left(rx) => { - // set = true; - // // TODO: should we handle the second disk io error - // match rx.try_recv().unwrap() { - // None => return None, - // Some(Err(_)) => panic!(), - // Some(Ok(TaskRes::Open(fd))) => Arc::new(fd), - // Some(Ok(TaskRes::Create(fd))) => Arc::new(fd), - // _ => unreachable!(), - // } - // } - // Either::Right(w) => w.clone(), - // }; - // if set { - // self.inner = Either::Right(fd.clone()); - // } - // Some(fd) - // } + fn try_get(&self) -> Option> { + let mut set = false; + let fd = match unsafe { &mut *self.inner.get() } { + Either::Left(rx) => { + set = true; + // TODO: should we handle the second disk io error + match rx.try_recv().unwrap() { + None => return None, + Some(Err(_)) => panic!(), + Some(Ok(TaskRes::Open(fd))) => Arc::new(fd), + Some(Ok(TaskRes::Create(fd))) => Arc::new(fd), + _ => unreachable!(), + } + } + Either::Right(w) => w.clone(), + }; + if set { + unsafe { + *self.inner.get() = Either::Right(fd.clone()); + } + } + Some(fd) + } } pub struct HedgedHandle { - disk1: Sender<(HandleTask, Callback)>, - disk2: Sender<(HandleTask, Callback)>, + disk1: Sender<(Task, Callback)>, + disk2: Sender<(Task, Callback)>, + + handle1: Arc, + handle2: Arc, + counter1: Arc, counter2: Arc, - fd1: Arc>>>, - fd2: Arc>>>, - - t1: Option>, - t2: Option>, } impl HedgedHandle { - pub fn new(handle1: FutureHandle, handle2: FutureHandle, counter1: Arc, counter2: Arc ) -> Self { - let (tx1, rx1) = unbounded::<(HandleTask, Callback)>(); - let (tx2, rx2) = unbounded::<(HandleTask, Callback)>(); - let counter1 = Arc::new(AtomicU64::new(0)); - let counter2 = Arc::new(AtomicU64::new(0)); - let fd1 = Arc::new(RwLock::new(None)); - let fd2 = Arc::new(RwLock::new(None)); - - let t1 = { - let fd1 = fd1.clone(); - let counter1 = counter1.clone(); - thread::spawn(move || { - let fd = handle1.get(); - fd1.write().unwrap().replace(fd.clone()); - for (task, cb) in rx1 { - if task == HandleTask::Stop { - break; - } - let res = Self::handle(&fd, task); - counter1.fetch_add(1, Ordering::Relaxed); - cb(res); - } - }) - }; - let t2 = { - let fd2 = fd2.clone(); - let counter2 = counter2.clone(); - thread::spawn(move || { - let fd = handle2.get(); - fd2.write().unwrap().replace(fd.clone()); - for (task, cb) in rx2 { - if task == HandleTask::Stop { - break; - } - let res = Self::handle(&fd, task); - counter2.fetch_add(1, Ordering::Relaxed); - cb(res); - } - }) - }; + pub fn new( + handle1: FutureHandle, + handle2: FutureHandle, + counter1: Arc, + counter2: Arc, + ) -> Self { + let (tx1, rx1) = unbounded::<(Task, Callback)>(); + let (tx2, rx2) = 
unbounded::<(Task, Callback)>(); + Self { disk1: tx1, disk2: tx2, + handle1: Arc::new(handle1), + handle2: Arc::new(handle2), counter1, counter2, - fd1, - fd2, - t1: Some(t1), - t2: Some(t2), - } - } - - fn handle(fd: &LogFd, task: HandleTask) -> IoResult { - match task { - HandleTask::Truncate{offset} => fd.truncate(offset).map(|_| HandleTaskRes::Truncate), - HandleTask::FileSize => fd.file_size().map(|s| HandleTaskRes::FileSize(s)), - HandleTask::Sync => fd.sync().map(|_| HandleTaskRes::Sync), - HandleTask::Write { offset, bytes } => fd.write(offset, &bytes).map(|s| HandleTaskRes::Write(s)), - HandleTask::Allocate { offset, size } => fd.allocate(offset, size).map(|_| HandleTaskRes::Allocate), - HandleTask::Stop => unreachable!(), } } @@ -634,41 +622,62 @@ impl HedgedHandle { let count2 = self.counter2.load(Ordering::Relaxed); match count1.cmp(&count2) { std::cmp::Ordering::Equal => { - if let Some(fd) = self.fd1.read().unwrap().as_ref() { + if let Some(fd) = self.handle1.try_get() { fd.read(offset, buf) - } else if let Some(fd) = self.fd2.read().unwrap().as_ref() { + } else if let Some(fd) = self.handle2.try_get() { fd.read(offset, buf) } else { panic!("Both fd1 and fd2 are None"); } } - std::cmp::Ordering::Greater => { - self.fd1.read().unwrap().as_ref().unwrap().read(offset, buf) - } - std::cmp::Ordering::Less => { - self.fd2.read().unwrap().as_ref().unwrap().read(offset, buf) - } + std::cmp::Ordering::Greater => self.handle1.try_get().unwrap().read(offset, buf), + std::cmp::Ordering::Less => self.handle2.try_get().unwrap().read(offset, buf), } } fn write(&self, offset: usize, content: &[u8]) -> IoResult { - block_on(self.wait_one(HandleTask::Write { - offset, - bytes: content.to_vec(), - })) - .map(|res| if let HandleTaskRes::Write(size) = res { size } else { unreachable!() }) + block_on(self.wait_one( + Task::Write { + handle: self.handle1.clone(), + offset, + bytes: content.to_vec(), + }, + Task::Write { + handle: self.handle2.clone(), + offset, + bytes: content.to_vec(), + }, + )) + .map(|res| { + if let TaskRes::Write(size) = res { + size + } else { + unreachable!() + } + }) } fn allocate(&self, offset: usize, size: usize) -> IoResult<()> { - block_on(self.wait_one(HandleTask::Allocate { offset, size })) + block_on(self.wait_one( + Task::Allocate { + handle: self.handle1.clone(), + offset, + size, + }, + Task::Allocate { + handle: self.handle2.clone(), + offset, + size, + }, + )) .map(|_| ()) } - async fn wait_one(&self, task: HandleTask) -> IoResult { + async fn wait_one(&self, task1: Task, task2: Task) -> IoResult { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.disk1.send((task.clone(), cb1)).unwrap(); - self.disk2.send((task, cb2)).unwrap(); + self.disk1.send((task1, cb1)).unwrap(); + self.disk2.send((task2, cb2)).unwrap(); select! 
{ res1 = f1 => res1.unwrap(), @@ -677,26 +686,41 @@ impl HedgedHandle { } } -impl Drop for HedgedHandle { - fn drop(&mut self) { - self.disk1.send((HandleTask::Stop, Box::new(|_| {}))).unwrap(); - self.disk2.send((HandleTask::Stop, Box::new(|_| {}))).unwrap(); - self.t1.take().unwrap().join().unwrap(); - self.t2.take().unwrap().join().unwrap(); - } -} - impl Handle for HedgedHandle { fn truncate(&self, offset: usize) -> IoResult<()> { - block_on(self.wait_one(HandleTask::Truncate{offset})).map(|_| ()) + block_on(self.wait_one( + Task::Truncate { + handle: self.handle1.clone(), + offset, + }, + Task::Truncate { + handle: self.handle2.clone(), + offset, + }, + )) + .map(|_| ()) } fn file_size(&self) -> IoResult { - block_on(self.wait_one(HandleTask::FileSize)).map(|res| if let HandleTaskRes::FileSize(size) = res { size } else { unreachable!() }) + block_on(self.wait_one( + Task::FileSize(self.handle1.clone()), + Task::FileSize(self.handle2.clone()), + )) + .map(|res| { + if let TaskRes::FileSize(size) = res { + size + } else { + unreachable!() + } + }) } fn sync(&self) -> IoResult<()> { - block_on(self.wait_one(HandleTask::Sync)).map(|_| ()) + block_on(self.wait_one( + Task::Sync(self.handle1.clone()), + Task::Sync(self.handle2.clone()), + )) + .map(|_| ()) } } diff --git a/src/env/mod.rs b/src/env/mod.rs index 25e1707b..d618cedb 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -66,7 +66,7 @@ pub trait FileSystem: Send + Sync + 'static + RecoverExt { } pub trait RecoverExt { - fn bootstrap() -> Result<()> { + fn bootstrap(&self) -> Result<()> { Ok(()) } @@ -78,7 +78,7 @@ pub trait RecoverExt { false } - fn trigger_recover(&self) { + fn trigger_recover(&self) { () } } diff --git a/src/env/obfuscated.rs b/src/env/obfuscated.rs index 6adaf277..d350f4df 100644 --- a/src/env/obfuscated.rs +++ b/src/env/obfuscated.rs @@ -5,6 +5,7 @@ use std::path::Path; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; +use crate::env::RecoverExt; use crate::env::{DefaultFileSystem, FileSystem, Permission, WriteExt}; pub struct ObfuscatedReader(::Reader); @@ -85,6 +86,8 @@ impl ObfuscatedFileSystem { } } +impl RecoverExt for ObfuscatedFileSystem {} + impl FileSystem for ObfuscatedFileSystem { type Handle = ::Handle; type Reader = ObfuscatedReader; diff --git a/src/file_pipe_log/log_file.rs b/src/file_pipe_log/log_file.rs index 3e73e247..572e8235 100644 --- a/src/file_pipe_log/log_file.rs +++ b/src/file_pipe_log/log_file.rs @@ -30,7 +30,7 @@ pub(super) fn build_file_writer( handle: Arc, format: LogFileFormat, force_reset: bool, -) -> Result> { +) -> IoResult> { let writer = system.new_writer(handle.clone())?; LogFileWriter::open(handle, writer, format, force_reset) } @@ -49,7 +49,7 @@ impl LogFileWriter { writer: F::Writer, format: LogFileFormat, force_reset: bool, - ) -> Result { + ) -> IoResult { let file_size = handle.file_size()?; let mut f = Self { handle, @@ -133,7 +133,7 @@ impl LogFileWriter { pub(crate) fn build_file_reader( system: &F, handle: Arc, -) -> Result> { +) -> IoResult> { let reader = system.new_reader(handle.clone())?; Ok(LogFileReader::open(handle, reader)) } diff --git a/src/file_pipe_log/mod.rs b/src/file_pipe_log/mod.rs index 0b036013..0ff8c02a 100644 --- a/src/file_pipe_log/mod.rs +++ b/src/file_pipe_log/mod.rs @@ -31,6 +31,7 @@ pub mod debug { use super::format::{FileNameExt, LogFileFormat}; use super::log_file::{LogFileReader, LogFileWriter}; use super::reader::LogItemBatchFileReader; + use std::io::Result as IoResult; /// Opens a log file for write. 
When `create` is true, the specified file /// will be created first if not exists. @@ -40,7 +41,7 @@ pub mod debug { path: &Path, format: LogFileFormat, create: bool, - ) -> Result> { + ) -> IoResult> { let fd = if create { file_system.create(path)? } else { @@ -54,7 +55,7 @@ pub mod debug { pub fn build_file_reader( file_system: &F, path: &Path, - ) -> Result> { + ) -> IoResult> { let fd = Arc::new(file_system.open(path, Permission::ReadOnly)?); super::log_file::build_file_reader(file_system, fd) } diff --git a/src/file_pipe_log/pipe.rs b/src/file_pipe_log/pipe.rs index bbec7bc9..f4eb2260 100644 --- a/src/file_pipe_log/pipe.rs +++ b/src/file_pipe_log/pipe.rs @@ -513,11 +513,11 @@ impl PipeLog for DualPipes { queue: LogQueue, bytes: &mut T, ) -> Result { - if self.file_system.need_recover() { - self.pipes[LogQueue::append].rotate(); - self.pipes[LogQueue::rewrite].rotate(); - self.file_system.trigger_recover(); - } + // if self.file_system.need_recover() { + // self.pipes[LogQueue::Append].rotate(); + // self.pipes[LogQueue::Rewrite].rotate(); + // self.file_system.trigger_recover(); + // } self.pipes[queue as usize].append(bytes) } diff --git a/src/memtable.rs b/src/memtable.rs index a667ce24..d46ba68b 100644 --- a/src/memtable.rs +++ b/src/memtable.rs @@ -1060,7 +1060,7 @@ impl MemTableAccessor { raft == 3, |_| {} ); - match item.content + match item.content { LogItemContent::EntryIndexes(entries_to_add) => { memtable.write().append(entries_to_add.0); } diff --git a/src/purge.rs b/src/purge.rs index a4c220cd..8a6e65d8 100644 --- a/src/purge.rs +++ b/src/purge.rs @@ -75,10 +75,10 @@ where pub fn purge_expired_files(&self) -> Result> { let _t = StopWatch::new(&*ENGINE_PURGE_DURATION_HISTOGRAM); - if self.file_system().is_in_recover() { - info!("skip purge due to in recover"); - return Ok(vec![]); - } + // if self.file_system().is_in_recover() { + // info!("skip purge due to in recover"); + // return Ok(vec![]); + // } let guard = self.force_rewrite_candidates.try_lock(); if guard.is_none() { warn!("Unable to purge expired files: locked"); From 928dd70dd80d4bdaa88eb9cf63897618b68ccfe7 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 31 Aug 2023 14:53:45 +0800 Subject: [PATCH 09/32] make send to two disks atomic Signed-off-by: Connor1996 --- src/env/double_write.rs | 59 ++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 5e1dc29f..df973f3e 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -22,6 +22,7 @@ use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; use std::thread; +use std::sync::Mutex; use crate::env::default::LogFd; use crate::env::DefaultFileSystem; @@ -92,13 +93,35 @@ fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { } } + +// Make sure the task is sent to two disks' channel atomically, otherwise the ordering of the tasks in two disks are not same. 
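The comment above is the crux of HedgedSender: two independent, unlocked sends could interleave differently on the two queues, so the replicas would replay the same tasks in different orders. A stripped-down sketch of the idea (names are illustrative; the real sender sends two distinct tasks, one per target directory, rather than a clone):

    use std::sync::{Arc, Mutex};
    use crossbeam::channel::Sender;

    #[derive(Clone)]
    struct PairedSender<T>(Arc<Mutex<(Sender<T>, Sender<T>)>>);

    impl<T: Clone> PairedSender<T> {
        fn send(&self, task: T) {
            // Both sends happen inside one critical section, so receiver 1 and
            // receiver 2 observe every task in exactly the same order.
            let inner = self.0.lock().unwrap();
            inner.0.send(task.clone()).unwrap();
            inner.1.send(task).unwrap();
        }
    }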
+#[derive(Clone)] +struct HedgedSender(Arc>); + +struct HedgedSenderInner { + disk1: Sender<(Task, Callback)>, + disk2: Sender<(Task, Callback)>, +} + +impl HedgedSender { + fn new(disk1: Sender<(Task, Callback)>, disk2: Sender<(Task, Callback)>) -> Self { + Self(Arc::new(Mutex::new(HedgedSenderInner { disk1, disk2 }))) + } + + fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { + let mut inner = self.0.lock().unwrap(); + inner.disk1.send((task1, cb1)).unwrap(); + inner.disk2.send((task2, cb2)).unwrap(); + } +} + pub struct HedgedFileSystem { base: Arc, path1: PathBuf, path2: PathBuf, - disk1: Sender<(Task, Callback)>, - disk2: Sender<(Task, Callback)>, + + sender: HedgedSender, counter1: Arc, counter2: Arc, @@ -141,12 +164,12 @@ impl HedgedFileSystem { counter2_clone.fetch_add(1, Ordering::Relaxed); } }); + let sender = HedgedSender::new(tx1, tx2); Self { base, path1, path2, - disk1: tx1, - disk2: tx2, + sender, counter1, counter2, handle1: Some(handle1), @@ -337,8 +360,7 @@ impl HedgedFileSystem { async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.disk1.send((task1, cb1)).unwrap(); - self.disk2.send((task2, cb2)).unwrap(); + self.sender.send(task1, task2, cb1, cb2); let resolve = |res: TaskRes| -> LogFd { match res { @@ -348,9 +370,9 @@ impl HedgedFileSystem { } }; select! { - res1 = f1 => res1.unwrap().map(|res| HedgedHandle::new( + res1 = f1 => res1.unwrap().map(|res| HedgedHandle::new(self.sender.clone(), FutureHandle::new_owned(resolve(res)), FutureHandle::new(f2) , self.counter1.clone(), self.counter2.clone())), - res2 = f2 => res2.unwrap().map(|res| HedgedHandle::new( + res2 = f2 => res2.unwrap().map(|res| HedgedHandle::new(self.sender.clone(), FutureHandle::new(f1), FutureHandle::new_owned(resolve(res)) , self.counter1.clone(), self.counter2.clone())), } } @@ -358,8 +380,7 @@ impl HedgedFileSystem { async fn wait_one(&self, task1: Task, task2: Task) -> IoResult<()> { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.disk1.send((task1, cb1)).unwrap(); - self.disk2.send((task2, cb2)).unwrap(); + self.sender.send(task1, task2, cb1, cb2) ; select! 
{ res1 = f1 => res1.unwrap().map(|_| ()), @@ -404,8 +425,7 @@ impl HedgedFileSystem { impl Drop for HedgedFileSystem { fn drop(&mut self) { - self.disk1.send((Task::Stop, Box::new(|_| {}))).unwrap(); - self.disk2.send((Task::Stop, Box::new(|_| {}))).unwrap(); + self.sender.send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); self.handle1.take().unwrap().join().unwrap(); self.handle2.take().unwrap().join().unwrap(); } @@ -585,8 +605,7 @@ impl FutureHandle { } pub struct HedgedHandle { - disk1: Sender<(Task, Callback)>, - disk2: Sender<(Task, Callback)>, + sender: HedgedSender, handle1: Arc, handle2: Arc, @@ -596,18 +615,15 @@ pub struct HedgedHandle { } impl HedgedHandle { - pub fn new( + fn new( + sender: HedgedSender, handle1: FutureHandle, handle2: FutureHandle, counter1: Arc, counter2: Arc, ) -> Self { - let (tx1, rx1) = unbounded::<(Task, Callback)>(); - let (tx2, rx2) = unbounded::<(Task, Callback)>(); - Self { - disk1: tx1, - disk2: tx2, + sender, handle1: Arc::new(handle1), handle2: Arc::new(handle2), counter1, @@ -676,8 +692,7 @@ impl HedgedHandle { async fn wait_one(&self, task1: Task, task2: Task) -> IoResult { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.disk1.send((task1, cb1)).unwrap(); - self.disk2.send((task2, cb2)).unwrap(); + self.sender.send(task1, task2, cb1, cb2) ; select! { res1 = f1 => res1.unwrap(), From 67542991c775c5f12a2bc37731e094765a2f42c4 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 31 Aug 2023 15:01:11 +0800 Subject: [PATCH 10/32] make format Signed-off-by: Connor1996 --- src/engine.rs | 2 +- src/env/double_write.rs | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index d964ba64..b36d8287 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -118,7 +118,7 @@ where ) -> Result>> { Engine::open_with(cfg, file_system, vec![]) } - + fn open_with( mut cfg: Config, file_system: Arc, diff --git a/src/env/double_write.rs b/src/env/double_write.rs index df973f3e..c5ef1fde 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -21,8 +21,8 @@ use std::path::PathBuf; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::thread; use std::sync::Mutex; +use std::thread; use crate::env::default::LogFd; use crate::env::DefaultFileSystem; @@ -93,8 +93,8 @@ fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { } } - -// Make sure the task is sent to two disks' channel atomically, otherwise the ordering of the tasks in two disks are not same. +// Make sure the task is sent to two disks' channel atomically, otherwise the +// ordering of the tasks in two disks are not same. #[derive(Clone)] struct HedgedSender(Arc>); @@ -104,7 +104,10 @@ struct HedgedSenderInner { } impl HedgedSender { - fn new(disk1: Sender<(Task, Callback)>, disk2: Sender<(Task, Callback)>) -> Self { + fn new( + disk1: Sender<(Task, Callback)>, + disk2: Sender<(Task, Callback)>, + ) -> Self { Self(Arc::new(Mutex::new(HedgedSenderInner { disk1, disk2 }))) } @@ -380,7 +383,7 @@ impl HedgedFileSystem { async fn wait_one(&self, task1: Task, task2: Task) -> IoResult<()> { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.sender.send(task1, task2, cb1, cb2) ; + self.sender.send(task1, task2, cb1, cb2); select! 
{ res1 = f1 => res1.unwrap().map(|_| ()), @@ -425,7 +428,8 @@ impl HedgedFileSystem { impl Drop for HedgedFileSystem { fn drop(&mut self) { - self.sender.send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); + self.sender + .send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); self.handle1.take().unwrap().join().unwrap(); self.handle2.take().unwrap().join().unwrap(); } @@ -692,7 +696,7 @@ impl HedgedHandle { async fn wait_one(&self, task1: Task, task2: Task) -> IoResult { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.sender.send(task1, task2, cb1, cb2) ; + self.sender.send(task1, task2, cb1, cb2); select! { res1 = f1 => res1.unwrap(), From b523bd4cb6dd410511731a913ebf4dad5aa5623a Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 31 Aug 2023 15:42:36 +0800 Subject: [PATCH 11/32] fix warning Signed-off-by: Connor1996 --- src/env/double_write.rs | 50 ++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index c5ef1fde..2971a2b3 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -8,14 +8,14 @@ use crate::file_pipe_log::FileNameExt; use crate::internals::parse_reserved_file_name; use crate::internals::FileId; use crate::internals::LogQueue; -use crate::{Error, Result}; +use crate::Error; use crossbeam::channel::unbounded; use crossbeam::channel::Sender; use fail::fail_point; use log::{info, warn}; use std::cell::UnsafeCell; use std::fs; -use std::io::{Error as IoError, Read, Result as IoResult, Seek, SeekFrom, Write}; +use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; use std::path::Path; use std::path::PathBuf; use std::sync::atomic::AtomicU64; @@ -112,7 +112,7 @@ impl HedgedSender { } fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { - let mut inner = self.0.lock().unwrap(); + let inner = self.0.lock().unwrap(); inner.disk1.send((task1, cb1)).unwrap(); inner.disk2.send((task2, cb2)).unwrap(); } @@ -180,7 +180,7 @@ impl HedgedFileSystem { } } - fn catch_up_diff(&self, fromFiles: Files, toFiles: Files) -> IoResult<()> { + fn catch_up_diff(&self, from_files: Files, to_files: Files) -> IoResult<()> { let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { let mut iter1 = from.iter().peekable(); let mut iter2 = to.iter().peekable(); @@ -192,8 +192,8 @@ impl HedgedFileSystem { (Some(f1), None) => { let to = replace_path( f1.path.as_ref(), - fromFiles.prefix.as_ref(), - toFiles.prefix.as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), ); fs::copy(&f1.path, to)?; iter1.next(); @@ -208,7 +208,7 @@ impl HedgedFileSystem { // TODO: do we need to check file size? 
// if f1.handle.file_size() != f2.handle.file_size() { // let to = replace_path(f1.path.as_ref(), - // fromFiles.prefix.as_ref(), toFiles.prefix.as_ref()); + // from_files.prefix.as_ref(), to_files.prefix.as_ref()); // fs::copy(&f1.path, &to)?; // } iter1.next(); @@ -217,8 +217,8 @@ impl HedgedFileSystem { std::cmp::Ordering::Less => { let to = replace_path( f1.path.as_ref(), - fromFiles.prefix.as_ref(), - toFiles.prefix.as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), ); fs::copy(&f1.path, to)?; iter1.next(); @@ -234,34 +234,34 @@ impl HedgedFileSystem { Ok(()) }; - check_files(&fromFiles.append_file, &toFiles.append_file)?; - check_files(&fromFiles.rewrite_file, &toFiles.rewrite_file)?; - check_files(&fromFiles.recycled_file, &toFiles.recycled_file)?; + check_files(&from_files.append_file, &to_files.append_file)?; + check_files(&from_files.rewrite_file, &to_files.rewrite_file)?; + check_files(&from_files.recycled_file, &to_files.recycled_file)?; // check file size is not enough, treat the last files differently considering // the recycle, always copy the last file // TODO: only copy diff part - if let Some(last_file) = fromFiles.append_file.last() { + if let Some(last_file) = from_files.append_file.last() { let to = replace_path( last_file.path.as_ref(), - fromFiles.prefix.as_ref(), - toFiles.prefix.as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), ); fs::copy(&last_file.path, to)?; } - if let Some(last_file) = fromFiles.rewrite_file.last() { + if let Some(last_file) = from_files.rewrite_file.last() { let to = replace_path( last_file.path.as_ref(), - fromFiles.prefix.as_ref(), - toFiles.prefix.as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), ); fs::copy(&last_file.path, to)?; } - if let Some(last_file) = fromFiles.recycled_file.last() { + if let Some(last_file) = from_files.recycled_file.last() { let to = replace_path( last_file.path.as_ref(), - fromFiles.prefix.as_ref(), - toFiles.prefix.as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), ); fs::copy(&last_file.path, to)?; } @@ -342,17 +342,17 @@ impl HedgedFileSystem { Error::Io(err) => return Err(err), _ => return Ok(0), }, - Ok(format) => { + Ok(_) => { // Do nothing } } loop { match reader.next() { - Ok(Some(item_batch)) => { + Ok(Some(_)) => { count += 1; } Ok(None) => break, - Err(e) => break, + Err(_) => break, } } } @@ -447,7 +447,7 @@ impl RecoverExt for HedgedFileSystem { match count1.cmp(&count2) { std::cmp::Ordering::Equal => { // still need to catch up, but only diff - self.catch_up_diff(files1, files2); + self.catch_up_diff(files1, files2)?; return Ok(()); } std::cmp::Ordering::Less => { From 340966957926917dabc531e62edae7af78056377 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Mon, 4 Sep 2023 16:15:04 +0800 Subject: [PATCH 12/32] refine tests Signed-off-by: Connor1996 --- Cargo.toml | 1 + src/engine.rs | 101 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 97 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3e9af25c..8fc6b771 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,6 +73,7 @@ rand = "0.8" rand_distr = "0.4" tempfile = "3.1" toml = "0.7" +md-5 = "0.10.5" [features] internals = [] diff --git a/src/engine.rs b/src/engine.rs index b36d8287..88d90096 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2056,6 +2056,8 @@ pub(crate) mod tests { } } + impl RecoverExt for DeleteMonitoredFileSystem {} + impl FileSystem for DeleteMonitoredFileSystem { type Handle = ::Handle; type Reader = ::Reader; @@ -2650,6 
+2652,30 @@ pub(crate) mod tests { r } + use md5::{Digest, Md5}; + use std::{fs, io}; + + fn calculate_hash(path: &Path) -> [u8; 16] { + let mut hasher = Md5::new(); + + std::fs::read_dir(path).unwrap().for_each(|e| { + let p = e.unwrap().path(); + let file_name = p.file_name().unwrap().to_str().unwrap(); + match FileId::parse_file_name(file_name) { + None => { + if parse_reserved_file_name(file_name).is_none() { + return; + } + } + _ => {} + } + let mut file = fs::File::open(&p).unwrap(); + let n = io::copy(&mut file, &mut hasher).unwrap(); + }); + hasher.finalize().into() + } + + use std::io::Write; #[test] fn test_start_engine_with_second_disk() { let dir = tempfile::Builder::new() @@ -2767,7 +2793,9 @@ pub(crate) mod tests { { std::fs::remove_dir_all(sec_dir.path()).unwrap(); let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + // All files in first dir are copied to second dir assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); } // abnormal case - Missing some append files in second dir { @@ -2775,20 +2803,21 @@ pub(crate) mod tests { for e in std::fs::read_dir(sec_dir.path()).unwrap() { let p = e.unwrap().path(); let file_name = p.file_name().unwrap().to_str().unwrap(); - println!("file_name: {}", file_name); if let Some(FileId { queue: LogQueue::Append, seq: _, }) = FileId::parse_file_name(file_name) { if file_count % 2 == 0 { - std::fs::remove_file(dir.path().join(file_name)).unwrap(); + std::fs::remove_file(sec_dir.path().join(file_name)).unwrap(); } file_count += 1; } } let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + // Missing append files are copied assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); } // abnormal case - Missing some rewrite files in second dir { @@ -2796,24 +2825,86 @@ pub(crate) mod tests { for e in std::fs::read_dir(sec_dir.path()).unwrap() { let p = e.unwrap().path(); let file_name = p.file_name().unwrap().to_str().unwrap(); - println!("file_name: {}", file_name); if let Some(FileId { queue: LogQueue::Rewrite, seq: _, }) = FileId::parse_file_name(file_name) { if file_count % 2 == 0 { - std::fs::remove_file(dir.path().join(file_name)).unwrap(); + std::fs::remove_file(sec_dir.path().join(file_name)).unwrap(); } file_count += 1; } } - let engine = Engine::open_with_file_system(cfg, file_system).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + // Missing rewrite files are copied assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); } // abnormal case - Missing some reserve files in second dir + { + let mut file_count = 0; + for e in std::fs::read_dir(sec_dir.path()).unwrap() { + let p = e.unwrap().path(); + let file_name = p.file_name().unwrap().to_str().unwrap(); + if let None = FileId::parse_file_name(file_name) { + if file_count % 2 == 0 { + std::fs::remove_file(sec_dir.path().join(file_name)).unwrap(); + } + file_count += 1; + } + } + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + // Missing reserve files are copied + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); + } // abnormal case - Have some extra files 
in second dir + { + let mut file_count = 0; + for e in std::fs::read_dir(sec_dir.path()).unwrap() { + let p = e.unwrap().path(); + let file_name = p.file_name().unwrap().to_str().unwrap(); + if file_count % 2 == 0 { + std::fs::copy( + sec_dir.path().join(file_name), + sec_dir.path().join(file_name.to_owned() + "tmp"), + ) + .unwrap(); + } + } + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + // Extra files are untouched. + assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); + } + // TODO: handle the error // abnormal case - One file is corrupted + { + for e in std::fs::read_dir(sec_dir.path()).unwrap() { + let p = e.unwrap().path(); + let file_name = p.file_name().unwrap().to_str().unwrap(); + if file_count % 2 == 0 { + let mut f = std::fs::OpenOptions::new() + .write(true) + .open(sec_dir.path().join(file_name)) + .unwrap(); + f.write_all(b"corrupted").unwrap(); + } + } + let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + // Corrupted files are untouched. + assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); + } + // abnormal case - One file in main dir is corrupted and one file in second dir + // is corrupted + { + + } + // abnormal case - Missing latest rewrite file in main dir and missing one log + // file in second dir + {} } #[test] From e1fc66cbab43ab16e8082f3d948441cc04f503c8 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Mon, 4 Sep 2023 17:43:12 +0800 Subject: [PATCH 13/32] wait both for rewrite files Signed-off-by: Connor1996 --- src/engine.rs | 6 +- src/env/double_write.rs | 230 +++++++++++++++++++++++++++++----------- 2 files changed, 171 insertions(+), 65 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 88d90096..e36a1940 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2878,7 +2878,7 @@ pub(crate) mod tests { assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); } - // TODO: handle the error + // TODO: handle the error // abnormal case - One file is corrupted { for e in std::fs::read_dir(sec_dir.path()).unwrap() { @@ -2899,9 +2899,7 @@ pub(crate) mod tests { } // abnormal case - One file in main dir is corrupted and one file in second dir // is corrupted - { - - } + {} // abnormal case - Missing latest rewrite file in main dir and missing one log // file in second dir {} diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 2971a2b3..5cb342f8 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -23,13 +23,14 @@ use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; use std::thread; +use std::thread::JoinHandle; use crate::env::default::LogFd; use crate::env::DefaultFileSystem; use crate::env::{FileSystem, Handle, Permission, WriteExt}; use futures::channel::oneshot; use futures::executor::block_on; -use futures::select; +use futures::{join, select}; use either::Either; @@ -65,9 +66,57 @@ enum Task { Stop, } +impl Task { + fn process(self, file_system: &DefaultFileSystem) -> IoResult { + match self { + Task::Create(path) => file_system.create(&path).map(|h| TaskRes::Create { + fd: h, + is_for_rewrite: path.extension().map_or(false, |ext| ext == "rewrite"), + }), + Task::Open { path, perm } => file_system.open(&path, perm).map(|h| TaskRes::Open { + fd: h, 
+ is_for_rewrite: path.extension().map_or(false, |ext| ext == "rewrite"), + }), + Task::Delete(path) => file_system.delete(path).map(|_| TaskRes::Delete), + Task::Rename { src_path, dst_path } => file_system + .rename(src_path, dst_path) + .map(|_| TaskRes::Rename), + Task::Stop => unreachable!(), + _ => self.handle_process(), + } + } + + fn handle_process(self) -> IoResult { + match self { + Task::Truncate { handle, offset } => { + handle.get().truncate(offset).map(|_| TaskRes::Truncate) + } + Task::FileSize(handle) => handle.get().file_size().map(|s| TaskRes::FileSize(s)), + Task::Sync(handle) => handle.get().sync().map(|_| TaskRes::Sync), + Task::Write { + handle, + offset, + bytes, + } => handle + .get() + .write(offset, &bytes) + .map(|s| TaskRes::Write(s)), + Task::Allocate { + handle, + offset, + size, + } => handle + .get() + .allocate(offset, size) + .map(|_| TaskRes::Allocate), + _ => unreachable!(), + } + } +} + enum TaskRes { - Create(LogFd), - Open(LogFd), + Create { fd: LogFd, is_for_rewrite: bool }, + Open { fd: LogFd, is_for_rewrite: bool }, Delete, Rename, Truncate, @@ -150,7 +199,7 @@ impl HedgedFileSystem { break; } fail_point!("double_write::thread1"); - let res = Self::process(&fs1, task); + let res = task.process(&fs1); cb(res); counter1_clone.fetch_add(1, Ordering::Relaxed); } @@ -162,7 +211,7 @@ impl HedgedFileSystem { if let Task::Stop = task { break; } - let res = Self::process(&fs2, task); + let res = task.process(&fs2); cb(res); counter2_clone.fetch_add(1, Ordering::Relaxed); } @@ -365,22 +414,40 @@ impl HedgedFileSystem { let (cb2, mut f2) = paired_future_callback(); self.sender.send(task1, task2, cb1, cb2); - let resolve = |res: TaskRes| -> LogFd { + let resolve = |res: TaskRes| -> (LogFd, bool) { match res { - TaskRes::Create(h) => h, - TaskRes::Open(h) => h, + TaskRes::Create { fd, is_for_rewrite } => (fd, is_for_rewrite), + TaskRes::Open { fd, is_for_rewrite } => (fd, is_for_rewrite), _ => unreachable!(), } }; select! 
{ - res1 = f1 => res1.unwrap().map(|res| HedgedHandle::new(self.sender.clone(), - FutureHandle::new_owned(resolve(res)), FutureHandle::new(f2) , self.counter1.clone(), self.counter2.clone())), - res2 = f2 => res2.unwrap().map(|res| HedgedHandle::new(self.sender.clone(), - FutureHandle::new(f1), FutureHandle::new_owned(resolve(res)) , self.counter1.clone(), self.counter2.clone())), + res1 = f1 => res1.unwrap().map(|res| { + let (fd, is_for_rewrite) = resolve(res); + HedgedHandle::new( + is_for_rewrite, + self.sender.clone(), + FutureHandle::new_owned(fd), + FutureHandle::new(f2), + self.counter1.clone(), + self.counter2.clone(), + ) + }), + res2 = f2 => res2.unwrap().map(|res| { + let (fd, is_for_rewrite) = resolve(res); + HedgedHandle::new( + is_for_rewrite, + self.sender.clone(), + FutureHandle::new(f1), + FutureHandle::new_owned(fd) , + self.counter1.clone(), + self.counter2.clone(), + ) + }), } } - async fn wait_one(&self, task1: Task, task2: Task) -> IoResult<()> { + async fn wait(&self, task1: Task, task2: Task) -> IoResult<()> { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); self.sender.send(task1, task2, cb1, cb2); @@ -390,40 +457,6 @@ impl HedgedFileSystem { res2 = f2 => res2.unwrap().map(|_| ()), } } - - #[inline] - fn process(file_system: &DefaultFileSystem, task: Task) -> IoResult { - match task { - Task::Create(path) => file_system.create(path).map(|h| TaskRes::Create(h)), - Task::Open { path, perm } => file_system.open(path, perm).map(|h| TaskRes::Open(h)), - Task::Delete(path) => file_system.delete(path).map(|_| TaskRes::Delete), - Task::Rename { src_path, dst_path } => file_system - .rename(src_path, dst_path) - .map(|_| TaskRes::Rename), - Task::Truncate { handle, offset } => { - handle.get().truncate(offset).map(|_| TaskRes::Truncate) - } - Task::FileSize(handle) => handle.get().file_size().map(|s| TaskRes::FileSize(s)), - Task::Sync(handle) => handle.get().sync().map(|_| TaskRes::Sync), - Task::Write { - handle, - offset, - bytes, - } => handle - .get() - .write(offset, &bytes) - .map(|s| TaskRes::Write(s)), - Task::Allocate { - handle, - offset, - size, - } => handle - .get() - .allocate(offset, size) - .map(|_| TaskRes::Allocate), - Task::Stop => unreachable!(), - } - } } impl Drop for HedgedFileSystem { @@ -503,7 +536,7 @@ impl FileSystem for HedgedFileSystem { } fn delete>(&self, path: P) -> IoResult<()> { - block_on(self.wait_one( + block_on(self.wait( Task::Delete(path.as_ref().to_path_buf()), Task::Delete(replace_path( path.as_ref(), @@ -514,7 +547,7 @@ impl FileSystem for HedgedFileSystem { } fn rename>(&self, src_path: P, dst_path: P) -> IoResult<()> { - block_on(self.wait_one( + block_on(self.wait( Task::Rename { src_path: src_path.as_ref().to_path_buf(), dst_path: dst_path.as_ref().to_path_buf(), @@ -568,8 +601,8 @@ impl FutureHandle { set = true; // TODO: should we handle the second disk io error match block_on(rx).unwrap().unwrap() { - TaskRes::Open(fd) => Arc::new(fd), - TaskRes::Create(fd) => Arc::new(fd), + TaskRes::Open { fd, .. } => Arc::new(fd), + TaskRes::Create { fd, .. } => Arc::new(fd), _ => unreachable!(), } } @@ -592,8 +625,8 @@ impl FutureHandle { match rx.try_recv().unwrap() { None => return None, Some(Err(_)) => panic!(), - Some(Ok(TaskRes::Open(fd))) => Arc::new(fd), - Some(Ok(TaskRes::Create(fd))) => Arc::new(fd), + Some(Ok(TaskRes::Open { fd, .. })) => Arc::new(fd), + Some(Ok(TaskRes::Create { fd, .. 
})) => Arc::new(fd), _ => unreachable!(), } } @@ -609,6 +642,8 @@ impl FutureHandle { } pub struct HedgedHandle { + strong_consistent: bool, + sender: HedgedSender, handle1: Arc, @@ -616,22 +651,62 @@ pub struct HedgedHandle { counter1: Arc, counter2: Arc, + + thread1: Option>, + thread2: Option>, } impl HedgedHandle { fn new( - sender: HedgedSender, + strong_consistent: bool, + mut sender: HedgedSender, handle1: FutureHandle, handle2: FutureHandle, - counter1: Arc, - counter2: Arc, + mut counter1: Arc, + mut counter2: Arc, ) -> Self { + let mut thread1 = None; + let mut thread2 = None; + if strong_consistent { + // use two separated threads for both wait + let (tx1, rx1) = unbounded::<(Task, Callback)>(); + let (tx2, rx2) = unbounded::<(Task, Callback)>(); + counter1 = Arc::new(AtomicU64::new(0)); + counter2 = Arc::new(AtomicU64::new(0)); + let counter1_clone = counter1.clone(); + thread1 = Some(thread::spawn(move || { + for (task, cb) in rx1 { + if let Task::Stop = task { + break; + } + let res = task.handle_process(); + cb(res); + counter1_clone.fetch_add(1, Ordering::Relaxed); + } + })); + let counter2_clone = counter2.clone(); + thread2 = Some(thread::spawn(move || { + for (task, cb) in rx2 { + if let Task::Stop = task { + break; + } + let res = task.handle_process(); + cb(res); + counter2_clone.fetch_add(1, Ordering::Relaxed); + } + })); + sender = HedgedSender::new(tx1, tx2); + } + Self { + strong_consistent, sender, handle1: Arc::new(handle1), handle2: Arc::new(handle2), counter1, counter2, + thread1, + thread2, } } @@ -656,7 +731,7 @@ impl HedgedHandle { } fn write(&self, offset: usize, content: &[u8]) -> IoResult { - block_on(self.wait_one( + block_on(self.wait( Task::Write { handle: self.handle1.clone(), offset, @@ -678,7 +753,7 @@ impl HedgedHandle { } fn allocate(&self, offset: usize, size: usize) -> IoResult<()> { - block_on(self.wait_one( + block_on(self.wait( Task::Allocate { handle: self.handle1.clone(), offset, @@ -703,11 +778,33 @@ impl HedgedHandle { res2 = f2 => res2.unwrap(), } } + + async fn wait_both(&self, task1: Task, task2: Task) -> IoResult { + let (cb1, f1) = paired_future_callback(); + let (cb2, f2) = paired_future_callback(); + self.sender.send(task1, task2, cb1, cb2); + + let (res1, res2) = join!(f1, f2); + match (res1.unwrap(), res2.unwrap()) { + (res @ Ok(_), Ok(_)) => res, + (Err(e), Err(_)) => Err(e), + (Err(e), _) => Err(e), + (_, Err(e)) => Err(e), + } + } + + async fn wait(&self, task1: Task, task2: Task) -> IoResult { + if self.strong_consistent { + self.wait_both(task1, task2).await + } else { + self.wait_one(task1, task2).await + } + } } impl Handle for HedgedHandle { fn truncate(&self, offset: usize) -> IoResult<()> { - block_on(self.wait_one( + block_on(self.wait( Task::Truncate { handle: self.handle1.clone(), offset, @@ -721,7 +818,7 @@ impl Handle for HedgedHandle { } fn file_size(&self) -> IoResult { - block_on(self.wait_one( + block_on(self.wait( Task::FileSize(self.handle1.clone()), Task::FileSize(self.handle2.clone()), )) @@ -735,7 +832,7 @@ impl Handle for HedgedHandle { } fn sync(&self) -> IoResult<()> { - block_on(self.wait_one( + block_on(self.wait( Task::Sync(self.handle1.clone()), Task::Sync(self.handle2.clone()), )) @@ -743,6 +840,17 @@ impl Handle for HedgedHandle { } } +impl Drop for HedgedHandle { + fn drop(&mut self) { + if self.strong_consistent { + self.sender + .send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); + self.thread1.take().unwrap().join().unwrap(); + self.thread2.take().unwrap().join().unwrap(); + } + 
} +} + pub struct HedgedWriter { inner: Arc, offset: usize, From f25052148b45afaade4344138a49d52c484607f1 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Mon, 4 Sep 2023 20:48:33 +0800 Subject: [PATCH 14/32] allocate sequential number for task Signed-off-by: Connor1996 --- src/env/double_write.rs | 68 +++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 5cb342f8..2ca93040 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -36,6 +36,11 @@ use either::Either; type Callback = Box) + Send>; +struct SeqTask { + inner: Task, + seq: u64, +} + enum Task { Create(PathBuf), Open { @@ -63,12 +68,13 @@ enum Task { offset: usize, size: usize, }, + Pause, Stop, } -impl Task { +impl SeqTask { fn process(self, file_system: &DefaultFileSystem) -> IoResult { - match self { + match self.inner { Task::Create(path) => file_system.create(&path).map(|h| TaskRes::Create { fd: h, is_for_rewrite: path.extension().map_or(false, |ext| ext == "rewrite"), @@ -81,13 +87,13 @@ impl Task { Task::Rename { src_path, dst_path } => file_system .rename(src_path, dst_path) .map(|_| TaskRes::Rename), - Task::Stop => unreachable!(), + Task::Stop | Task::Pause => unreachable!(), _ => self.handle_process(), } } fn handle_process(self) -> IoResult { - match self { + match self.inner { Task::Truncate { handle, offset } => { handle.get().truncate(offset).map(|_| TaskRes::Truncate) } @@ -148,20 +154,36 @@ fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { struct HedgedSender(Arc>); struct HedgedSenderInner { - disk1: Sender<(Task, Callback)>, - disk2: Sender<(Task, Callback)>, + disk1: Sender<(SeqTask, Callback)>, + disk2: Sender<(SeqTask, Callback)>, + seq: u64, } impl HedgedSender { fn new( - disk1: Sender<(Task, Callback)>, - disk2: Sender<(Task, Callback)>, + disk1: Sender<(SeqTask, Callback)>, + disk2: Sender<(SeqTask, Callback)>, ) -> Self { - Self(Arc::new(Mutex::new(HedgedSenderInner { disk1, disk2 }))) + Self(Arc::new(Mutex::new(HedgedSenderInner { + disk1, + disk2, + seq: 0, + }))) } fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { - let inner = self.0.lock().unwrap(); + let mut inner = self.0.lock().unwrap(); + if !matches!(task1, Task::Stop | Task::Pause) { + inner.seq += 1; + } + let task1 = SeqTask { + inner: task1, + seq: inner.seq, + }; + let task2 = SeqTask { + inner: task2, + seq: inner.seq, + }; inner.disk1.send((task1, cb1)).unwrap(); inner.disk2.send((task2, cb2)).unwrap(); } @@ -187,33 +209,39 @@ pub struct HedgedFileSystem { impl HedgedFileSystem { pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { - let (tx1, rx1) = unbounded::<(Task, Callback)>(); - let (tx2, rx2) = unbounded::<(Task, Callback)>(); + let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); + let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); let counter1 = Arc::new(AtomicU64::new(0)); let counter2 = Arc::new(AtomicU64::new(0)); let counter1_clone = counter1.clone(); let fs1 = base.clone(); let handle1 = thread::spawn(move || { for (task, cb) in rx1 { - if let Task::Stop = task { + if let Task::Stop = task.inner { break; } fail_point!("double_write::thread1"); + let seq = task.seq; let res = task.process(&fs1); cb(res); - counter1_clone.fetch_add(1, Ordering::Relaxed); + if seq != 0 { + counter1_clone.store(seq, Ordering::Relaxed); + } } }); let counter2_clone = counter2.clone(); let fs2 = base.clone(); let handle2 = thread::spawn(move || { for (task, cb) in rx2 { - if 
let Task::Stop = task { + if let Task::Stop = task.inner { break; } + let seq = task.seq; let res = task.process(&fs2); cb(res); - counter2_clone.fetch_add(1, Ordering::Relaxed); + if seq != 0 { + counter2_clone.store(seq, Ordering::Relaxed); + } } }); let sender = HedgedSender::new(tx1, tx2); @@ -669,14 +697,14 @@ impl HedgedHandle { let mut thread2 = None; if strong_consistent { // use two separated threads for both wait - let (tx1, rx1) = unbounded::<(Task, Callback)>(); - let (tx2, rx2) = unbounded::<(Task, Callback)>(); + let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); + let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); counter1 = Arc::new(AtomicU64::new(0)); counter2 = Arc::new(AtomicU64::new(0)); let counter1_clone = counter1.clone(); thread1 = Some(thread::spawn(move || { for (task, cb) in rx1 { - if let Task::Stop = task { + if let Task::Stop = task.inner { break; } let res = task.handle_process(); @@ -687,7 +715,7 @@ impl HedgedHandle { let counter2_clone = counter2.clone(); thread2 = Some(thread::spawn(move || { for (task, cb) in rx2 { - if let Task::Stop = task { + if let Task::Stop = task.inner { break; } let res = task.handle_process(); From 6b60bf7a7f920710cc03cbd7e69e9770f983f15f Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 5 Sep 2023 14:31:27 +0800 Subject: [PATCH 15/32] rename and fix seqno update Signed-off-by: Connor1996 --- src/env/double_write.rs | 93 ++++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 33 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 2ca93040..eaadeae7 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -197,8 +197,8 @@ pub struct HedgedFileSystem { sender: HedgedSender, - counter1: Arc, - counter2: Arc, + seqno1: Arc, + seqno2: Arc, handle1: Option>, handle2: Option>, @@ -211,9 +211,9 @@ impl HedgedFileSystem { pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); - let counter1 = Arc::new(AtomicU64::new(0)); - let counter2 = Arc::new(AtomicU64::new(0)); - let counter1_clone = counter1.clone(); + let seqno1 = Arc::new(AtomicU64::new(0)); + let seqno2 = Arc::new(AtomicU64::new(0)); + let seqno1_clone = seqno1.clone(); let fs1 = base.clone(); let handle1 = thread::spawn(move || { for (task, cb) in rx1 { @@ -223,13 +223,17 @@ impl HedgedFileSystem { fail_point!("double_write::thread1"); let seq = task.seq; let res = task.process(&fs1); - cb(res); if seq != 0 { - counter1_clone.store(seq, Ordering::Relaxed); + // seqno should be updated before the write callback is called, otherwise one + // read may be performed right after the write is finished. Then the read may be + // performed on the other disk not having the data because the seqno for this + // disk is not updated yet. 
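Concretely: suppose both disks have finished task 6 and both seqnos read 6, and task 7 is a write that disk 2 happens to complete first. If the callback ran before the store, a reader racing in right after the write is acknowledged would still see seqno1 == seqno2 == 6, and the Equal branch of `read` prefers disk 1, which has not applied write 7 yet. Storing the sequence number before invoking the callback guarantees such a reader observes seqno2 == 7 and routes the read to disk 2.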
+ seqno1_clone.store(seq, Ordering::Relaxed); } + cb(res); } }); - let counter2_clone = counter2.clone(); + let seqno2_clone = seqno2.clone(); let fs2 = base.clone(); let handle2 = thread::spawn(move || { for (task, cb) in rx2 { @@ -238,10 +242,10 @@ impl HedgedFileSystem { } let seq = task.seq; let res = task.process(&fs2); - cb(res); if seq != 0 { - counter2_clone.store(seq, Ordering::Relaxed); + seqno2_clone.store(seq, Ordering::Relaxed); } + cb(res); } }); let sender = HedgedSender::new(tx1, tx2); @@ -250,8 +254,8 @@ impl HedgedFileSystem { path1, path2, sender, - counter1, - counter2, + seqno1, + seqno2, handle1: Some(handle1), handle2: Some(handle2), } @@ -457,8 +461,8 @@ impl HedgedFileSystem { self.sender.clone(), FutureHandle::new_owned(fd), FutureHandle::new(f2), - self.counter1.clone(), - self.counter2.clone(), + self.seqno1.clone(), + self.seqno2.clone(), ) }), res2 = f2 => res2.unwrap().map(|res| { @@ -468,8 +472,8 @@ impl HedgedFileSystem { self.sender.clone(), FutureHandle::new(f1), FutureHandle::new_owned(fd) , - self.counter1.clone(), - self.counter2.clone(), + self.seqno1.clone(), + self.seqno2.clone(), ) }), } @@ -677,8 +681,8 @@ pub struct HedgedHandle { handle1: Arc, handle2: Arc, - counter1: Arc, - counter2: Arc, + seqno1: Arc, + seqno2: Arc, thread1: Option>, thread2: Option>, @@ -690,8 +694,8 @@ impl HedgedHandle { mut sender: HedgedSender, handle1: FutureHandle, handle2: FutureHandle, - mut counter1: Arc, - mut counter2: Arc, + mut seqno1: Arc, + mut seqno2: Arc, ) -> Self { let mut thread1 = None; let mut thread2 = None; @@ -699,28 +703,31 @@ impl HedgedHandle { // use two separated threads for both wait let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); - counter1 = Arc::new(AtomicU64::new(0)); - counter2 = Arc::new(AtomicU64::new(0)); - let counter1_clone = counter1.clone(); + // replace the seqno with self owned, then in `read` the seqno from two disks + // should be always the same. It's just to reuse the logic without + // adding special check in `read` + seqno1 = Arc::new(AtomicU64::new(0)); + seqno2 = Arc::new(AtomicU64::new(0)); + let seqno1_clone = seqno1.clone(); thread1 = Some(thread::spawn(move || { for (task, cb) in rx1 { if let Task::Stop = task.inner { break; } let res = task.handle_process(); + seqno1_clone.fetch_add(1, Ordering::Relaxed); cb(res); - counter1_clone.fetch_add(1, Ordering::Relaxed); } })); - let counter2_clone = counter2.clone(); + let seqno2_clone = seqno2.clone(); thread2 = Some(thread::spawn(move || { for (task, cb) in rx2 { if let Task::Stop = task.inner { break; } let res = task.handle_process(); + seqno2_clone.fetch_add(1, Ordering::Relaxed); cb(res); - counter2_clone.fetch_add(1, Ordering::Relaxed); } })); sender = HedgedSender::new(tx1, tx2); @@ -731,20 +738,40 @@ impl HedgedHandle { sender, handle1: Arc::new(handle1), handle2: Arc::new(handle2), - counter1, - counter2, + seqno1, + seqno2, thread1, thread2, } } fn read(&self, offset: usize, buf: &mut [u8]) -> IoResult { - // TODO: read simultaneously from both disks - // choose latest to perform read - let count1 = self.counter1.load(Ordering::Relaxed); - let count2 = self.counter2.load(Ordering::Relaxed); - match count1.cmp(&count2) { + // Raft engine promises that the offset would be read only after the write is + // finished and memtable is updated. And the hedged file system promises that + // the write is done when either one of the disk finishes the write. 
Here the + // read data must be present in at least one of the disks. So choose the disk of + // largest seqno to read. + // + // Safety: the get for these two seqno is not necessary to be atomic. + // What if the seqno2 is updated after getting seqno1? It's fine, let's say + // - T1 denotes the time of getting seqno1, the actual seqno for disk1 and disk2 + // is S1, S2 + // - T2 denotes the time of getting seqno2, the actual seqno for disk1 and disk2 + // is S1', S2' + // Assume disk2 is just slightly slower than disk1, here is a possible case: + // - T1: S1 = 10, S2 = 9 + // - T2: S1'= 12, S2'= 11 + // Then, what we get would be seq1=10, seq2=11, and the read would be performed + // on disk2. But disk2 is slower than disk1. The data may not be written yet. + // Would the read on a slower disk is safe? + // Yes, it's safe because at T1 we know the data can be read at least with a + // seqno of S1, then at T2, S2' > S1, so the data must be already written in the + // disk2, even if it's the slow disk. + let seq1 = self.seqno1.load(Ordering::Relaxed); + let seq2 = self.seqno2.load(Ordering::Relaxed); + match seq1.cmp(&seq2) { std::cmp::Ordering::Equal => { + // TODO: read simultaneously from both disks and return the faster one if let Some(fd) = self.handle1.try_get() { fd.read(offset, buf) } else if let Some(fd) = self.handle2.try_get() { From 0bbb0bb3a3443ae1443181800dcd1b84a6a0f7bd Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Wed, 6 Sep 2023 21:25:41 +0800 Subject: [PATCH 16/32] introduce pause mechanism Signed-off-by: Connor1996 --- src/env/double_write.rs | 306 +++++++++++++++++++++++++++++++++------- src/purge.rs | 2 + 2 files changed, 258 insertions(+), 50 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index eaadeae7..8f58dd58 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -22,6 +22,7 @@ use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; +use std::sync::RwLock; use std::thread; use std::thread::JoinHandle; @@ -69,6 +70,7 @@ enum Task { size: usize, }, Pause, + Snapshot(PathBuf), Stop, } @@ -87,6 +89,51 @@ impl SeqTask { Task::Rename { src_path, dst_path } => file_system .rename(src_path, dst_path) .map(|_| TaskRes::Rename), + Task::Snapshot(path) => { + let mut snapshot = Snapshot::default(); + fs::read_dir(path) + .unwrap() + .try_for_each(|e| -> IoResult<()> { + let dir_entry = e?; + let p = dir_entry.path(); + if !p.is_file() { + return Ok(()); + } + let file_name = p.file_name().unwrap().to_str().unwrap(); + match FileId::parse_file_name(file_name) { + Some(FileId { + queue: LogQueue::Append, + seq, + }) => snapshot.append_file.push(( + FileName { + seq, + path: p.to_path_buf(), + path_id: 0, + }, + file_system.open(&p, Permission::ReadOnly).unwrap(), + )), + Some(FileId { + queue: LogQueue::Rewrite, + seq, + }) => {} // exclude rewrite files, they are always synced + _ => { + if let Some(seq) = parse_reserved_file_name(file_name) { + snapshot.recycled_file.push(( + FileName { + seq, + path: p.to_path_buf(), + path_id: 0, + }, + file_system.open(&p, Permission::ReadOnly).unwrap(), + )); + } + } + } + Ok(()) + }) + .unwrap(); + Ok(TaskRes::Snapshot(snapshot)) + } Task::Stop | Task::Pause => unreachable!(), _ => self.handle_process(), } @@ -130,6 +177,14 @@ enum TaskRes { Sync, Write(usize), Allocate, + Snapshot(Snapshot), +} + +#[derive(Default)] +struct Snapshot { + append_file: Vec<(FileName, LogFd)>, + recycled_file: Vec<(FileName, LogFd)>, + // exclude rewrite files 
} #[derive(Default)] @@ -148,8 +203,12 @@ fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { } } +// let say the average entry size is 100B, then the total size of the log in the +// channel is 1GB, +const ABORT_THRESHOLD: usize = 10000; + // Make sure the task is sent to two disks' channel atomically, otherwise the -// ordering of the tasks in two disks are not same. +// ordering of the tasks in two disks' channels are not same. #[derive(Clone)] struct HedgedSender(Arc>); @@ -157,25 +216,30 @@ struct HedgedSenderInner { disk1: Sender<(SeqTask, Callback)>, disk2: Sender<(SeqTask, Callback)>, seq: u64, + state: Arc>, } impl HedgedSender { fn new( disk1: Sender<(SeqTask, Callback)>, disk2: Sender<(SeqTask, Callback)>, + state: Arc>, ) -> Self { Self(Arc::new(Mutex::new(HedgedSenderInner { disk1, disk2, seq: 0, + state, }))) } fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { - let mut inner = self.0.lock().unwrap(); - if !matches!(task1, Task::Stop | Task::Pause) { - inner.seq += 1; + if matches!(task1, Task::Pause | Task::Snapshot) { + unreachable!(); } + + let mut inner = self.0.lock().unwrap(); + inner.seq += 1; let task1 = SeqTask { inner: task1, seq: inner.seq, @@ -184,9 +248,72 @@ impl HedgedSender { inner: task2, seq: inner.seq, }; - inner.disk1.send((task1, cb1)).unwrap(); - inner.disk2.send((task2, cb2)).unwrap(); + let state = inner.state.read().unwrap(); + if state == RecoveryState::Normal { + let check1 = inner.disk1.len() > ABORT_THRESHOLD; + let check2 = inner.disk2.len() > ABORT_THRESHOLD; + match (check1, check2) { + (true, true) => { + panic!("Both channels of disk1 and disk2 are full") + } + (true, false) => { + *inner.state.write().unwrap() = RecoveryState::Paused1; + inner + .disk1 + .send(( + SeqTask { + inner: Task::Pause, + seq: 0, + }, + Box::new(|_| {}), + )) + .unwrap(); + } + (false, true) => { + *inner.state.write().unwrap() = RecoveryState::Paused2; + inner + .disk2 + .send(( + SeqTask { + inner: Task::Pause, + seq: 0, + }, + Box::new(|_| {}), + )) + .unwrap(); + } + _ => {} + } + } + if state != RecoveryState::Paused1 && state != RecoveryState::WaitRecover1 { + inner.disk1.send((task1, cb1)).unwrap(); + } + if state != RecoveryState::Paused2 && state != RecoveryState::WaitRecover2 { + inner.disk2.send((task2, cb2)).unwrap(); + } } + + fn send_snapshot(&self, index: u8, task: Task, cb: Callback) { + assert!(matches!(task1, Task::Pause | Task::Snapshot)); + + let mut inner = self.0.lock().unwrap(); + if index == 1 { + inner.disk1.send((task, cb)).unwrap(); + } else { + inner.disk2.send((task, cb)).unwrap(); + } + } +} + +enum RecoveryState { + Normal, + Paused1, /* When the length of channel of disk1 reaches threshold, a + * `Pause` task is sent and no more later task will be sent + * to disk1 */ + Paused2, // no more task will be sent to disk2 + WaitRecover1(oneshot::Sender<()>), + WaitRecover2(oneshot::Sender<()>), + Recovering, } pub struct HedgedFileSystem { @@ -202,6 +329,8 @@ pub struct HedgedFileSystem { handle1: Option>, handle2: Option>, + + state: Arc>, } // TODO: read both dir at recovery, maybe no need? 
cause operations are to both @@ -211,23 +340,33 @@ impl HedgedFileSystem { pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); + let state = Arc::new(RwLock::new(RecoveryState::Normal)); let seqno1 = Arc::new(AtomicU64::new(0)); let seqno2 = Arc::new(AtomicU64::new(0)); let seqno1_clone = seqno1.clone(); let fs1 = base.clone(); + let state1 = state.clone(); let handle1 = thread::spawn(move || { for (task, cb) in rx1 { if let Task::Stop = task.inner { break; } + if let Task::Pause(rx) = task.inner { + let (tx, rx) = oneshot::channel(); + *state1.write().unwrap() = RecoveryState::WaitRecover1(tx); + let _ = rx.recv(); + // indicate the pause is done + // do not update seqno for pause task + continue; + } fail_point!("double_write::thread1"); let seq = task.seq; let res = task.process(&fs1); + // seqno should be updated before the write callback is called, otherwise one + // read may be performed right after the write is finished. Then the read may be + // performed on the other disk not having the data because the seqno for this + // disk is not updated yet. if seq != 0 { - // seqno should be updated before the write callback is called, otherwise one - // read may be performed right after the write is finished. Then the read may be - // performed on the other disk not having the data because the seqno for this - // disk is not updated yet. seqno1_clone.store(seq, Ordering::Relaxed); } cb(res); @@ -235,11 +374,18 @@ impl HedgedFileSystem { }); let seqno2_clone = seqno2.clone(); let fs2 = base.clone(); + let state2 = state.clone(); let handle2 = thread::spawn(move || { for (task, cb) in rx2 { if let Task::Stop = task.inner { break; } + if let Task::Pause(rx) = task.inner { + let (tx, rx) = oneshot::channel(); + *state2.write().unwrap() = RecoveryState::WaitRecover2(tx); + let _ = rx.recv(); + continue; + } let seq = task.seq; let res = task.process(&fs2); if seq != 0 { @@ -248,7 +394,7 @@ impl HedgedFileSystem { cb(res); } }); - let sender = HedgedSender::new(tx1, tx2); + let sender = HedgedSender::new(tx1, tx2, state.clone()); Self { base, path1, @@ -258,11 +404,14 @@ impl HedgedFileSystem { seqno2, handle1: Some(handle1), handle2: Some(handle2), + state, } } fn catch_up_diff(&self, from_files: Files, to_files: Files) -> IoResult<()> { let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { + let last_from_seq = from.last().map(|f| f.seq).unwrap_or(0); + let mut iter1 = from.iter().peekable(); let mut iter2 = to.iter().peekable(); // compare files of from and to, if the file in from is not in to, copy it to @@ -286,12 +435,17 @@ impl HedgedFileSystem { (Some(f1), Some(f2)) => { match f1.seq.cmp(&f2.seq) { std::cmp::Ordering::Equal => { - // TODO: do we need to check file size? 
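
The catch-up pass above is essentially a merge of two seq-sorted file lists: copy what only the source has, delete what only the target has, and re-copy the last (still active) file even when both sides have it. A standalone sketch of that merge, with files reduced to hypothetical (seq, path) pairs and the copy/remove actions returned instead of executed:

    use std::cmp::Ordering;
    use std::path::PathBuf;

    // Walk two seq-sorted lists in lock step and decide, per file, whether to
    // copy it to the lagging disk or remove it from there.
    fn catch_up(from: &[(u64, PathBuf)], to: &[(u64, PathBuf)]) -> (Vec<PathBuf>, Vec<PathBuf>) {
        let (mut to_copy, mut to_remove) = (Vec::new(), Vec::new());
        let last_from_seq = from.last().map(|f| f.0).unwrap_or(0);
        let (mut i, mut j) = (0, 0);
        while i < from.len() || j < to.len() {
            match (from.get(i), to.get(j)) {
                // Only the source has it: copy it over.
                (Some(f), None) => { to_copy.push(f.1.clone()); i += 1; }
                // Only the target has it: it is stale, delete it.
                (None, Some(t)) => { to_remove.push(t.1.clone()); j += 1; }
                (Some(f), Some(t)) => match f.0.cmp(&t.0) {
                    Ordering::Equal => {
                        // Both sides have it; re-copy only the active (last) file,
                        // whose tail may differ.
                        if f.0 == last_from_seq { to_copy.push(f.1.clone()); }
                        i += 1; j += 1;
                    }
                    Ordering::Less => { to_copy.push(f.1.clone()); i += 1; }
                    Ordering::Greater => { to_remove.push(t.1.clone()); j += 1; }
                },
                (None, None) => unreachable!(),
            }
        }
        (to_copy, to_remove)
    }

    fn main() {
        let from = vec![(1, PathBuf::from("a/0001.raftlog")), (2, PathBuf::from("a/0002.raftlog"))];
        let to = vec![(1, PathBuf::from("b/0001.raftlog")), (3, PathBuf::from("b/0003.raftlog"))];
        let (copy, remove) = catch_up(&from, &to);
        assert_eq!(copy, vec![PathBuf::from("a/0002.raftlog")]);
        assert_eq!(remove, vec![PathBuf::from("b/0003.raftlog")]);
    }
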
- // if f1.handle.file_size() != f2.handle.file_size() { - // let to = replace_path(f1.path.as_ref(), - // from_files.prefix.as_ref(), to_files.prefix.as_ref()); - // fs::copy(&f1.path, &to)?; - // } + // check file size is not enough, treat the last files differently + // considering the recycle, always copy the last file + // TODO: only copy diff part + if f1.seq == last_from_seq { + let to = replace_path( + f1.path.as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), + ); + fs::copy(&f1.path, to)?; + } iter1.next(); iter2.next(); } @@ -318,35 +472,6 @@ impl HedgedFileSystem { check_files(&from_files.append_file, &to_files.append_file)?; check_files(&from_files.rewrite_file, &to_files.rewrite_file)?; check_files(&from_files.recycled_file, &to_files.recycled_file)?; - - // check file size is not enough, treat the last files differently considering - // the recycle, always copy the last file - // TODO: only copy diff part - if let Some(last_file) = from_files.append_file.last() { - let to = replace_path( - last_file.path.as_ref(), - from_files.prefix.as_ref(), - to_files.prefix.as_ref(), - ); - fs::copy(&last_file.path, to)?; - } - if let Some(last_file) = from_files.rewrite_file.last() { - let to = replace_path( - last_file.path.as_ref(), - from_files.prefix.as_ref(), - to_files.prefix.as_ref(), - ); - fs::copy(&last_file.path, to)?; - } - if let Some(last_file) = from_files.recycled_file.last() { - let to = replace_path( - last_file.path.as_ref(), - from_files.prefix.as_ref(), - to_files.prefix.as_ref(), - ); - fs::copy(&last_file.path, to)?; - } - Ok(()) } @@ -526,15 +651,96 @@ impl RecoverExt for HedgedFileSystem { } fn need_recover(&self) -> bool { - false + // in wait recover state, the task should be still dropped + let res = self.state.read().unwrap() == RecoveryState::WaitRecover; + if res { + // in recovering stat, the task can keep sending + *self.state.write().unwrap() = RecoveryState::Recovering; + } + res } fn is_in_recover(&self) -> bool { - false + self.state.read().unwrap() == RecoveryState::Recovering } fn trigger_recover(&self) { - () + // TODO: send task to get snapshot + let (cb, mut f) = paired_future_callback(); + let to_files = match self.state.read().unwrap() { + RecoveryState::WaitRecover1(tx) => { + self.sender + .send_snapshot(1, Task::Snapshot(self.path2.clone()), cb); + self.get_files(&self.path1).unwrap() // TODO: handle error + } + RecoveryState::WaitRecover2(tx) => { + self.sender + .send_snapshot(2, Task::Snapshot(self.path1.clone()), cb); + self.get_files(&self.path2).unwrap() + } + _ => unreachable!(), + }; + + let from_files = blocking_on(f).map(|res| { + if let TaskRes::Snapshot(files) = res { + files + } else { + unreachable!() + } + }); + + let check_files = |from: &Vec<(FileName, LogFd)>, to: &Vec| -> IoResult<()> { + let mut iter1 = from.iter().peekable(); + let mut iter2 = to.iter().peekable(); + // compare files of from and to, if the file in from is not in to, copy it to + // to, and if the file in to is not in from, delete it + loop { + match (iter1.peek(), iter2.peek()) { + (None, None) => break, + (Some(f1), None) => { + let to = replace_path( + f1.path.as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), + ); + io::copy(&f1.path, to)?; + iter1.next(); + } + (None, Some(f2)) => { + fs::remove_file(&f2.path)?; + iter2.next(); + } + (Some(f1), Some(f2)) => { + match f1.seq.cmp(&f2.seq) { + std::cmp::Ordering::Equal => { + iter1.next(); + iter2.next(); + } + std::cmp::Ordering::Less => { + let to = replace_path( + 
f1.path.as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), + ); + fs::copy(&f1.path, to)?; + iter1.next(); + } + std::cmp::Ordering::Greater => { + fs::remove_file(&f2.path)?; + iter2.next(); + } + } + } + } + } + Ok(()) + }; + + // TODO: async + self.catch_up_diff(from_files, to_files) + + // when + *self.state.write().unwrap() = RecoveryState::Normal; } } @@ -730,7 +936,7 @@ impl HedgedHandle { cb(res); } })); - sender = HedgedSender::new(tx1, tx2); + sender = HedgedSender::new(tx1, tx2, sender.state.clone()); } Self { diff --git a/src/purge.rs b/src/purge.rs index 8a6e65d8..c7f889fc 100644 --- a/src/purge.rs +++ b/src/purge.rs @@ -75,6 +75,8 @@ where pub fn purge_expired_files(&self) -> Result> { let _t = StopWatch::new(&*ENGINE_PURGE_DURATION_HISTOGRAM); + // Purge would delete files, whereas the files may be copied by recovery + // process, so do not purge when recovering. // if self.file_system().is_in_recover() { // info!("skip purge due to in recover"); // return Ok(vec![]); From 026861a5d7e1436ce68dce7c06fb75eaa0600235 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 7 Sep 2023 22:09:21 +0800 Subject: [PATCH 17/32] reuse catch up diff Signed-off-by: Connor1996 --- src/engine.rs | 3 - src/env/double_write.rs | 312 ++++++++++++++++++++++------------------ 2 files changed, 171 insertions(+), 144 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index e36a1940..b85c0d7c 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -146,9 +146,6 @@ where stats.clone(), listeners.clone(), ); - // HedgingManager::new( - // pipe_log.clone(), - // ) let (tx, rx) = mpsc::channel(); let stats_clone = stats.clone(); diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 8f58dd58..fb86b30d 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -1,5 +1,6 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. +use crate::env::default::LogFile; use crate::env::RecoverExt; use crate::file_pipe_log::log_file::build_file_reader; use crate::file_pipe_log::pipe_builder::FileName; @@ -90,7 +91,10 @@ impl SeqTask { .rename(src_path, dst_path) .map(|_| TaskRes::Rename), Task::Snapshot(path) => { - let mut snapshot = Snapshot::default(); + let mut files = Files { + prefix: path.clone(), + ..Default::default() + }; fs::read_dir(path) .unwrap() .try_for_each(|e| -> IoResult<()> { @@ -104,35 +108,37 @@ impl SeqTask { Some(FileId { queue: LogQueue::Append, seq, - }) => snapshot.append_file.push(( + }) => files.append_files.push(SeqFile::Handle(( FileName { seq, path: p.to_path_buf(), path_id: 0, }, - file_system.open(&p, Permission::ReadOnly).unwrap(), - )), + Arc::new(file_system.open(&p, Permission::ReadOnly).unwrap()), + ))), Some(FileId { queue: LogQueue::Rewrite, - seq, + .. 
}) => {} // exclude rewrite files, they are always synced _ => { if let Some(seq) = parse_reserved_file_name(file_name) { - snapshot.recycled_file.push(( + files.reserved_files.push(SeqFile::Handle(( FileName { seq, path: p.to_path_buf(), path_id: 0, }, - file_system.open(&p, Permission::ReadOnly).unwrap(), - )); + Arc::new( + file_system.open(&p, Permission::ReadOnly).unwrap(), + ), + ))); } } } Ok(()) }) .unwrap(); - Ok(TaskRes::Snapshot(snapshot)) + Ok(TaskRes::Snapshot(files)) } Task::Stop | Task::Pause => unreachable!(), _ => self.handle_process(), @@ -177,22 +183,54 @@ enum TaskRes { Sync, Write(usize), Allocate, - Snapshot(Snapshot), -} - -#[derive(Default)] -struct Snapshot { - append_file: Vec<(FileName, LogFd)>, - recycled_file: Vec<(FileName, LogFd)>, - // exclude rewrite files + Snapshot(Files), } #[derive(Default)] struct Files { prefix: PathBuf, - append_file: Vec, - rewrite_file: Vec, - recycled_file: Vec, + append_files: Vec, + rewrite_files: Vec, + reserved_files: Vec, +} + +enum SeqFile { + Path(FileName), + Handle((FileName, Arc)), +} + +impl SeqFile { + fn seq(&self) -> u64 { + match self { + SeqFile::Path(f) => f.seq, + SeqFile::Handle((f, _)) => f.seq, + } + } + + fn path(&self) -> &PathBuf { + match self { + SeqFile::Path(f) => &f.path, + SeqFile::Handle((f, _)) => &f.path, + } + } + + fn remove(&self) -> IoResult<()> { + match self { + SeqFile::Path(f) => fs::remove_file(&f.path), + SeqFile::Handle((f, _)) => fs::remove_file(&f.path), + } + } + + fn copy(&self, file_system: &DefaultFileSystem, to: &PathBuf) -> IoResult { + match self { + SeqFile::Path(f) => fs::copy(&f.path, to.as_path()), + SeqFile::Handle((_, fd)) => { + let mut reader = LogFile::new(fd.clone()); + let mut writer = LogFile::new(Arc::new(file_system.create(to)?)); + std::io::copy(&mut reader, &mut writer) + } + } + } } fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { @@ -233,8 +271,12 @@ impl HedgedSender { }))) } + fn state(&self) -> Arc> { + self.0.lock().unwrap().state.clone() + } + fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { - if matches!(task1, Task::Pause | Task::Snapshot) { + if matches!(task1, Task::Pause | Task::Snapshot(_)) { unreachable!(); } @@ -249,7 +291,7 @@ impl HedgedSender { seq: inner.seq, }; let state = inner.state.read().unwrap(); - if state == RecoveryState::Normal { + if matches!(*state, RecoveryState::Normal) { let check1 = inner.disk1.len() > ABORT_THRESHOLD; let check2 = inner.disk2.len() > ABORT_THRESHOLD; match (check1, check2) { @@ -285,22 +327,46 @@ impl HedgedSender { _ => {} } } - if state != RecoveryState::Paused1 && state != RecoveryState::WaitRecover1 { + if !matches!( + *state, + RecoveryState::Paused1 | RecoveryState::WaitRecover1(_) + ) { inner.disk1.send((task1, cb1)).unwrap(); } - if state != RecoveryState::Paused2 && state != RecoveryState::WaitRecover2 { + if !matches!( + *state, + RecoveryState::Paused2 | RecoveryState::WaitRecover2(_) + ) { inner.disk2.send((task2, cb2)).unwrap(); } } fn send_snapshot(&self, index: u8, task: Task, cb: Callback) { - assert!(matches!(task1, Task::Pause | Task::Snapshot)); + assert!(matches!(task, Task::Snapshot(_))); - let mut inner = self.0.lock().unwrap(); + let inner = self.0.lock().unwrap(); if index == 1 { - inner.disk1.send((task, cb)).unwrap(); + inner + .disk1 + .send(( + SeqTask { + inner: task, + seq: 0, + }, + cb, + )) + .unwrap(); } else { - inner.disk2.send((task, cb)).unwrap(); + inner + .disk2 + .send(( + SeqTask { + inner: task, + seq: 0, + }, + cb, + )) + 
.unwrap(); } } } @@ -351,10 +417,10 @@ impl HedgedFileSystem { if let Task::Stop = task.inner { break; } - if let Task::Pause(rx) = task.inner { + if let Task::Pause = task.inner { let (tx, rx) = oneshot::channel(); *state1.write().unwrap() = RecoveryState::WaitRecover1(tx); - let _ = rx.recv(); + let _ = block_on(rx); // indicate the pause is done // do not update seqno for pause task continue; @@ -380,10 +446,10 @@ impl HedgedFileSystem { if let Task::Stop = task.inner { break; } - if let Task::Pause(rx) = task.inner { + if let Task::Pause = task.inner { let (tx, rx) = oneshot::channel(); *state2.write().unwrap() = RecoveryState::WaitRecover2(tx); - let _ = rx.recv(); + let _ = block_on(rx); continue; } let seq = task.seq; @@ -408,10 +474,17 @@ impl HedgedFileSystem { } } - fn catch_up_diff(&self, from_files: Files, to_files: Files) -> IoResult<()> { - let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { - let last_from_seq = from.last().map(|f| f.seq).unwrap_or(0); - + fn catch_up_diff(&self, mut from_files:Files, mut to_files: Files, skip_rewrite: bool) -> IoResult<()> { + from_files.append_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files.append_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + from_files.rewrite_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files.rewrite_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + from_files.reserved_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files.reserved_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + + let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { + let last_from_seq = from.last().map(|f| f.seq()).unwrap_or(0); + let mut iter1 = from.iter().peekable(); let mut iter2 = to.iter().peekable(); // compare files of from and to, if the file in from is not in to, copy it to @@ -421,45 +494,45 @@ impl HedgedFileSystem { (None, None) => break, (Some(f1), None) => { let to = replace_path( - f1.path.as_ref(), + f1.path().as_ref(), from_files.prefix.as_ref(), to_files.prefix.as_ref(), ); - fs::copy(&f1.path, to)?; + f1.copy(&self.base, &to)?; iter1.next(); } (None, Some(f2)) => { - fs::remove_file(&f2.path)?; + f2.remove()?; iter2.next(); } (Some(f1), Some(f2)) => { - match f1.seq.cmp(&f2.seq) { + match f1.seq().cmp(&f2.seq()) { std::cmp::Ordering::Equal => { // check file size is not enough, treat the last files differently // considering the recycle, always copy the last file // TODO: only copy diff part - if f1.seq == last_from_seq { + if f1.seq() == last_from_seq { let to = replace_path( - f1.path.as_ref(), + f1.path().as_ref(), from_files.prefix.as_ref(), to_files.prefix.as_ref(), ); - fs::copy(&f1.path, to)?; + f1.copy(&self.base, &to)?; } iter1.next(); iter2.next(); } std::cmp::Ordering::Less => { let to = replace_path( - f1.path.as_ref(), + f1.path().as_ref(), from_files.prefix.as_ref(), to_files.prefix.as_ref(), ); - fs::copy(&f1.path, to)?; + f1.copy(&self.base, &to)?; iter1.next(); } std::cmp::Ordering::Greater => { - fs::remove_file(&f2.path)?; + f2.remove()?; iter2.next(); } } @@ -469,9 +542,11 @@ impl HedgedFileSystem { Ok(()) }; - check_files(&from_files.append_file, &to_files.append_file)?; - check_files(&from_files.rewrite_file, &to_files.rewrite_file)?; - check_files(&from_files.recycled_file, &to_files.recycled_file)?; + check_files(&from_files.append_files, &to_files.append_files)?; + if !skip_rewrite { + check_files(&from_files.rewrite_files, &to_files.rewrite_files)?; + } + check_files(&from_files.reserved_files, &to_files.reserved_files)?; Ok(()) } @@ -498,49 +573,47 @@ impl HedgedFileSystem { 
Some(FileId { queue: LogQueue::Append, seq, - }) => files.append_file.push(FileName { + }) => files.append_files.push(SeqFile::Path(FileName { seq, path: p, path_id: 0, - }), + })), Some(FileId { queue: LogQueue::Rewrite, seq, - }) => files.rewrite_file.push(FileName { + }) => files.rewrite_files.push(SeqFile::Path(FileName { seq, path: p, path_id: 0, - }), + })), _ => { if let Some(seq) = parse_reserved_file_name(file_name) { - files.recycled_file.push(FileName { + files.reserved_files.push(SeqFile::Path(FileName { seq, path: p, path_id: 0, - }) + })) } } } Ok(()) }) .unwrap(); - files.append_file.sort_by(|a, b| a.seq.cmp(&b.seq)); - files.rewrite_file.sort_by(|a, b| a.seq.cmp(&b.seq)); - files.recycled_file.sort_by(|a, b| a.seq.cmp(&b.seq)); + Ok(files) } fn get_latest_valid_seq(&self, files: &Files) -> IoResult { let mut count = 0; - if let Some(f) = files.append_file.last() { + if let Some(f) = files.append_files.last() { let recovery_read_block_size = 1024; let mut reader = LogItemBatchFileReader::new(recovery_read_block_size); - let handle = Arc::new(self.base.open(&f.path, Permission::ReadOnly)?); + let handle = Arc::new(self.base.open(&f.path(), Permission::ReadOnly)?); let file_reader = build_file_reader(self.base.as_ref(), handle)?; match reader.open( FileId { queue: LogQueue::Append, - seq: f.seq, + seq: f.seq(), }, file_reader, ) { @@ -637,14 +710,14 @@ impl RecoverExt for HedgedFileSystem { match count1.cmp(&count2) { std::cmp::Ordering::Equal => { // still need to catch up, but only diff - self.catch_up_diff(files1, files2)?; + self.catch_up_diff(files1, files2, false)?; return Ok(()); } std::cmp::Ordering::Less => { - self.catch_up_diff(files2, files1)?; + self.catch_up_diff(files2, files1, false)?; } std::cmp::Ordering::Greater => { - self.catch_up_diff(files1, files2)?; + self.catch_up_diff(files1, files2, false)?; } } Ok(()) @@ -652,7 +725,10 @@ impl RecoverExt for HedgedFileSystem { fn need_recover(&self) -> bool { // in wait recover state, the task should be still dropped - let res = self.state.read().unwrap() == RecoveryState::WaitRecover; + let res = matches!( + *self.state.read().unwrap(), + RecoveryState::WaitRecover1(_) | RecoveryState::WaitRecover2(_) + ); if res { // in recovering stat, the task can keep sending *self.state.write().unwrap() = RecoveryState::Recovering; @@ -661,86 +737,40 @@ impl RecoverExt for HedgedFileSystem { } fn is_in_recover(&self) -> bool { - self.state.read().unwrap() == RecoveryState::Recovering + matches!(*self.state.read().unwrap(), RecoveryState::Recovering) } fn trigger_recover(&self) { - // TODO: send task to get snapshot - let (cb, mut f) = paired_future_callback(); - let to_files = match self.state.read().unwrap() { - RecoveryState::WaitRecover1(tx) => { - self.sender - .send_snapshot(1, Task::Snapshot(self.path2.clone()), cb); - self.get_files(&self.path1).unwrap() // TODO: handle error - } - RecoveryState::WaitRecover2(tx) => { - self.sender - .send_snapshot(2, Task::Snapshot(self.path1.clone()), cb); - self.get_files(&self.path2).unwrap() - } - _ => unreachable!(), - }; - - let from_files = blocking_on(f).map(|res| { - if let TaskRes::Snapshot(files) = res { - files - } else { - unreachable!() - } - }); - - let check_files = |from: &Vec<(FileName, LogFd)>, to: &Vec| -> IoResult<()> { - let mut iter1 = from.iter().peekable(); - let mut iter2 = to.iter().peekable(); - // compare files of from and to, if the file in from is not in to, copy it to - // to, and if the file in to is not in from, delete it - loop { - match 
(iter1.peek(), iter2.peek()) { - (None, None) => break, - (Some(f1), None) => { - let to = replace_path( - f1.path.as_ref(), - from_files.prefix.as_ref(), - to_files.prefix.as_ref(), - ); - io::copy(&f1.path, to)?; - iter1.next(); - } - (None, Some(f2)) => { - fs::remove_file(&f2.path)?; - iter2.next(); - } - (Some(f1), Some(f2)) => { - match f1.seq.cmp(&f2.seq) { - std::cmp::Ordering::Equal => { - iter1.next(); - iter2.next(); - } - std::cmp::Ordering::Less => { - let to = replace_path( - f1.path.as_ref(), - from_files.prefix.as_ref(), - to_files.prefix.as_ref(), - ); - fs::copy(&f1.path, to)?; - iter1.next(); - } - std::cmp::Ordering::Greater => { - fs::remove_file(&f2.path)?; - iter2.next(); - } - } - } - } - } - Ok(()) - }; + // let (cb, f) = paired_future_callback(); + // let to_files = match *self.state.read().unwrap() { + // RecoveryState::WaitRecover1(tx) => { + // self.sender + // .send_snapshot(1, Task::Snapshot(self.path2.clone()), cb); + // self.get_files(&self.path1).unwrap() // TODO: handle error + // } + // RecoveryState::WaitRecover2(tx) => { + // self.sender + // .send_snapshot(2, Task::Snapshot(self.path1.clone()), cb); + // self.get_files(&self.path2).unwrap() + // } + // _ => unreachable!(), + // }; + + // let from_files = block_on(f).unwrap().map(|res| { + // if let TaskRes::Snapshot(files) = res { + // files + // } else { + // unreachable!() + // } + // }).unwrap(); // TODO: handle error // TODO: async - self.catch_up_diff(from_files, to_files) + // exclude rewrite files because rewrite files are always synced + // self.catch_up_diff(from_files, to_files, true); - // when + // when *self.state.write().unwrap() = RecoveryState::Normal; + // tx.send(()).unwrap(); } } @@ -936,7 +966,7 @@ impl HedgedHandle { cb(res); } })); - sender = HedgedSender::new(tx1, tx2, sender.state.clone()); + sender = HedgedSender::new(tx1, tx2, sender.state()); } Self { @@ -1104,8 +1134,8 @@ impl Handle for HedgedHandle { impl Drop for HedgedHandle { fn drop(&mut self) { if self.strong_consistent { - self.sender - .send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); + // self.sender + // .send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); self.thread1.take().unwrap().join().unwrap(); self.thread2.take().unwrap().join().unwrap(); } From 9efb9a4b282f14d86b960e69c6c22146c9e4cb19 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 12 Sep 2023 13:50:31 +0800 Subject: [PATCH 18/32] non blocking drop Signed-off-by: Connor1996 --- src/env/double_write.rs | 72 ++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index fb86b30d..eaa571fc 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -118,7 +118,7 @@ impl SeqTask { ))), Some(FileId { queue: LogQueue::Rewrite, - .. + .. 
}) => {} // exclude rewrite files, they are always synced _ => { if let Some(seq) = parse_reserved_file_name(file_name) { @@ -184,6 +184,7 @@ enum TaskRes { Write(usize), Allocate, Snapshot(Files), + Stop, } #[derive(Default)] @@ -415,6 +416,7 @@ impl HedgedFileSystem { let handle1 = thread::spawn(move || { for (task, cb) in rx1 { if let Task::Stop = task.inner { + cb(Ok(TaskRes::Stop)); break; } if let Task::Pause = task.inner { @@ -444,6 +446,7 @@ impl HedgedFileSystem { let handle2 = thread::spawn(move || { for (task, cb) in rx2 { if let Task::Stop = task.inner { + cb(Ok(TaskRes::Stop)); break; } if let Task::Pause = task.inner { @@ -474,17 +477,30 @@ impl HedgedFileSystem { } } - fn catch_up_diff(&self, mut from_files:Files, mut to_files: Files, skip_rewrite: bool) -> IoResult<()> { - from_files.append_files.sort_by(|a, b| a.seq().cmp(&b.seq())); - to_files.append_files.sort_by(|a, b| a.seq().cmp(&b.seq())); - from_files.rewrite_files.sort_by(|a, b| a.seq().cmp(&b.seq())); - to_files.rewrite_files.sort_by(|a, b| a.seq().cmp(&b.seq())); - from_files.reserved_files.sort_by(|a, b| a.seq().cmp(&b.seq())); - to_files.reserved_files.sort_by(|a, b| a.seq().cmp(&b.seq())); - + fn catch_up_diff( + &self, + mut from_files: Files, + mut to_files: Files, + skip_rewrite: bool, + ) -> IoResult<()> { + from_files + .append_files + .sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files.append_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + from_files + .rewrite_files + .sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files.rewrite_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + from_files + .reserved_files + .sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files + .reserved_files + .sort_by(|a, b| a.seq().cmp(&b.seq())); + let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { let last_from_seq = from.last().map(|f| f.seq()).unwrap_or(0); - + let mut iter1 = from.iter().peekable(); let mut iter2 = to.iter().peekable(); // compare files of from and to, if the file in from is not in to, copy it to @@ -691,10 +707,34 @@ impl HedgedFileSystem { impl Drop for HedgedFileSystem { fn drop(&mut self) { - self.sender - .send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); - self.handle1.take().unwrap().join().unwrap(); - self.handle2.take().unwrap().join().unwrap(); + block_on(self.wait(Task::Stop, Task::Stop)).unwrap(); + + let t1 = self.handle1.take().unwrap(); + let t2 = self.handle2.take().unwrap(); + let mut times = 0; + loop { + // wait 1s + if t1.is_finished() && t2.is_finished() { + t1.join().unwrap(); + t2.join().unwrap(); + break; + } + times += 1; + if times > 100 { + // one disk may be blocked for a long time, + // to avoid block shutdown process for a long time, do not join the threads + // here, only need at least to ensure one thread is exited + if t1.is_finished() || t2.is_finished() { + if t1.is_finished() { + t1.join().unwrap(); + } else { + t2.join().unwrap(); + } + break; + } + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } } } @@ -1134,8 +1174,8 @@ impl Handle for HedgedHandle { impl Drop for HedgedHandle { fn drop(&mut self) { if self.strong_consistent { - // self.sender - // .send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); + self.sender + .send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); self.thread1.take().unwrap().join().unwrap(); self.thread2.take().unwrap().join().unwrap(); } From b8df7eb8178e0f7d705308d8046dcacc68c35343 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 12 Sep 2023 13:50:41 +0800 Subject: [PATCH 19/32] change 
test Signed-off-by: Connor1996 --- tests/failpoints/test_engine.rs | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/tests/failpoints/test_engine.rs b/tests/failpoints/test_engine.rs index 44fe5a56..58b8d5ff 100644 --- a/tests/failpoints/test_engine.rs +++ b/tests/failpoints/test_engine.rs @@ -1,6 +1,6 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. -use raft_engine::env::HedgedFileSystem; +use raft_engine::env::{DefaultFileSystem, HedgedFileSystem}; use std::path::Path; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Barrier}; @@ -1217,8 +1217,8 @@ fn test_start_engine_with_slow_second_disk() { .tempdir() .unwrap(); - fail::cfg("double_write::thread1", "pause").unwrap(); let file_system = Arc::new(HedgedFileSystem::new( + Arc::new(DefaultFileSystem {}), dir.path().to_path_buf(), sec_dir.path().to_path_buf(), )); @@ -1233,7 +1233,8 @@ fn test_start_engine_with_slow_second_disk() { }; // Step 1: write data into the main directory. - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = Engine::open_with_file_system(cfg.clone(), file_system).unwrap(); + fail::cfg("double_write::thread1", "pause").unwrap(); for rid in 1..=10 { append(&engine, rid, 1, 10, Some(&entry_data)); } @@ -1253,6 +1254,11 @@ fn test_start_engine_with_slow_second_disk() { purge_threshold: ReadableSize(40), ..cfg }; + let file_system = Arc::new(HedgedFileSystem::new( + Arc::new(DefaultFileSystem {}), + dir.path().to_path_buf(), + sec_dir.path().to_path_buf(), + )); let engine = Engine::open_with_file_system(cfg_2, file_system).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); for rid in 1..=10 { @@ -1262,23 +1268,8 @@ fn test_start_engine_with_slow_second_disk() { engine.write(&mut log_batch, true).unwrap(); } assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); - fail::cfg("double_write::thread1", "pause").unwrap(); engine.purge_manager().must_rewrite_append_queue(None, None); - assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); - fail::remove("double_write::thread1"); - - let mut times = 0; - loop { - if number_of_files(sec_dir.path()) == number_of_files(dir.path()) { - break; - } - if times > 50 { - panic!("rewrite queue is not finished"); - } - times += 1; - std::thread::sleep(Duration::from_millis(10)); - } - + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); let file_count = number_of_files(dir.path()); // Append data, recycled files are reused. 
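
The test drives the slow-disk scenario through the `fail` crate: configuring a failpoint with the `pause` action parks the worker thread at that point until the failpoint is removed. A self-contained sketch of that pattern (the failpoint name `worker::slow_disk` is made up here, and the `fail` crate must be built with its `failpoints` feature for the pause to take effect):

    use fail::fail_point;
    use std::sync::mpsc;
    use std::thread;
    use std::time::Duration;

    // Worker standing in for one disk's task thread; the test can freeze it here.
    fn worker(rx: mpsc::Receiver<u64>, done: mpsc::Sender<u64>) {
        for n in rx {
            fail_point!("worker::slow_disk");
            done.send(n).unwrap();
        }
    }

    fn main() {
        let (tx, rx) = mpsc::channel();
        let (done_tx, done_rx) = mpsc::channel();
        thread::spawn(move || worker(rx, done_tx));

        fail::cfg("worker::slow_disk", "pause").unwrap();
        tx.send(1).unwrap();
        // The worker is parked at the failpoint, so nothing is delivered yet.
        assert!(done_rx.recv_timeout(Duration::from_millis(100)).is_err());
        fail::remove("worker::slow_disk");
        // Once the failpoint is removed the queued task completes.
        assert_eq!(done_rx.recv().unwrap(), 1);
    }
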
for rid in 1..=30 { From bcd73bf14b4c5cd7d924f967c2b9ce4354a2f666 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 12 Sep 2023 16:13:24 +0800 Subject: [PATCH 20/32] extract task runner Signed-off-by: Connor1996 --- src/env/double_write.rs | 154 ++++++++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 61 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index eaa571fc..c5e4fdff 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -11,7 +11,7 @@ use crate::internals::FileId; use crate::internals::LogQueue; use crate::Error; use crossbeam::channel::unbounded; -use crossbeam::channel::Sender; +use crossbeam::channel::{Receiver, Sender}; use fail::fail_point; use log::{info, warn}; use std::cell::UnsafeCell; @@ -394,15 +394,82 @@ pub struct HedgedFileSystem { seqno1: Arc, seqno2: Arc, - handle1: Option>, - handle2: Option>, + thread1: Option>, + thread2: Option>, + + state: Arc>, +} +struct TaskRunner { + id: u8, + path: PathBuf, + fs: Arc, + rx: Receiver<(SeqTask, Callback)>, + seqno: Arc, state: Arc>, } +impl TaskRunner { + fn new( + id: u8, + path: PathBuf, + fs: Arc, + rx: Receiver<(SeqTask, Callback)>, + seqno: Arc, + state: Arc>, + ) -> Self { + Self { + id, + path, + fs, + rx, + seqno, + state, + } + } + + fn spawn(self) -> JoinHandle<()> { + thread::Builder::new() + .name(format!("raft-engine-disk{}", self.id)) + .spawn(move || { + for (task, cb) in self.rx { + if let Task::Stop = task.inner { + cb(Ok(TaskRes::Stop)); + break; + } + if let Task::Pause = task.inner { + let (tx, rx) = oneshot::channel(); + *self.state.write().unwrap() = if self.id == 1 { + RecoveryState::WaitRecover1(tx) + } else { + RecoveryState::WaitRecover2(tx) + }; + let _ = block_on(rx); + // indicate the pause is done + // do not update seqno for pause task + continue; + } + if self.id == 1 { + fail_point!("double_write::thread1"); + } + let seq = task.seq; + let res = task.process(&self.fs); + // seqno should be updated before the write callback is called, otherwise one + // read may be performed right after the write is finished. Then the read may be + // performed on the other disk not having the data because the seqno for this + // disk is not updated yet. + if seq != 0 { + self.seqno.store(seq, Ordering::Relaxed); + } + cb(res); + } + }) + .unwrap() + } +} + // TODO: read both dir at recovery, maybe no need? cause operations are to both // disks TODO: consider encryption - impl HedgedFileSystem { pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); @@ -410,59 +477,24 @@ impl HedgedFileSystem { let state = Arc::new(RwLock::new(RecoveryState::Normal)); let seqno1 = Arc::new(AtomicU64::new(0)); let seqno2 = Arc::new(AtomicU64::new(0)); - let seqno1_clone = seqno1.clone(); - let fs1 = base.clone(); - let state1 = state.clone(); - let handle1 = thread::spawn(move || { - for (task, cb) in rx1 { - if let Task::Stop = task.inner { - cb(Ok(TaskRes::Stop)); - break; - } - if let Task::Pause = task.inner { - let (tx, rx) = oneshot::channel(); - *state1.write().unwrap() = RecoveryState::WaitRecover1(tx); - let _ = block_on(rx); - // indicate the pause is done - // do not update seqno for pause task - continue; - } - fail_point!("double_write::thread1"); - let seq = task.seq; - let res = task.process(&fs1); - // seqno should be updated before the write callback is called, otherwise one - // read may be performed right after the write is finished. 
Then the read may be - // performed on the other disk not having the data because the seqno for this - // disk is not updated yet. - if seq != 0 { - seqno1_clone.store(seq, Ordering::Relaxed); - } - cb(res); - } - }); - let seqno2_clone = seqno2.clone(); - let fs2 = base.clone(); - let state2 = state.clone(); - let handle2 = thread::spawn(move || { - for (task, cb) in rx2 { - if let Task::Stop = task.inner { - cb(Ok(TaskRes::Stop)); - break; - } - if let Task::Pause = task.inner { - let (tx, rx) = oneshot::channel(); - *state2.write().unwrap() = RecoveryState::WaitRecover2(tx); - let _ = block_on(rx); - continue; - } - let seq = task.seq; - let res = task.process(&fs2); - if seq != 0 { - seqno2_clone.store(seq, Ordering::Relaxed); - } - cb(res); - } - }); + let runner1 = TaskRunner::new( + 1, + path1.clone(), + base.clone(), + rx1, + seqno1.clone(), + state.clone(), + ); + let runner2 = TaskRunner::new( + 2, + path2.clone(), + base.clone(), + rx2, + seqno2.clone(), + state.clone(), + ); + let thread1 = runner1.spawn(); + let thread2 = runner2.spawn(); let sender = HedgedSender::new(tx1, tx2, state.clone()); Self { base, @@ -471,8 +503,8 @@ impl HedgedFileSystem { sender, seqno1, seqno2, - handle1: Some(handle1), - handle2: Some(handle2), + thread1: Some(thread1), + thread2: Some(thread2), state, } } @@ -709,8 +741,8 @@ impl Drop for HedgedFileSystem { fn drop(&mut self) { block_on(self.wait(Task::Stop, Task::Stop)).unwrap(); - let t1 = self.handle1.take().unwrap(); - let t2 = self.handle2.take().unwrap(); + let t1 = self.thread1.take().unwrap(); + let t2 = self.thread2.take().unwrap(); let mut times = 0; loop { // wait 1s From d343e6b6dd8dbf5a012de7e945be017877f906de Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 12 Sep 2023 17:03:06 +0800 Subject: [PATCH 21/32] reuse get files for snapshot Signed-off-by: Connor1996 --- src/env/double_write.rs | 87 ++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 54 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index c5e4fdff..14a077d8 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -91,53 +91,18 @@ impl SeqTask { .rename(src_path, dst_path) .map(|_| TaskRes::Rename), Task::Snapshot(path) => { - let mut files = Files { - prefix: path.clone(), - ..Default::default() - }; - fs::read_dir(path) - .unwrap() - .try_for_each(|e| -> IoResult<()> { - let dir_entry = e?; - let p = dir_entry.path(); - if !p.is_file() { - return Ok(()); - } - let file_name = p.file_name().unwrap().to_str().unwrap(); - match FileId::parse_file_name(file_name) { - Some(FileId { - queue: LogQueue::Append, - seq, - }) => files.append_files.push(SeqFile::Handle(( - FileName { - seq, - path: p.to_path_buf(), - path_id: 0, - }, - Arc::new(file_system.open(&p, Permission::ReadOnly).unwrap()), - ))), - Some(FileId { - queue: LogQueue::Rewrite, - .. 
- }) => {} // exclude rewrite files, they are always synced - _ => { - if let Some(seq) = parse_reserved_file_name(file_name) { - files.reserved_files.push(SeqFile::Handle(( - FileName { - seq, - path: p.to_path_buf(), - path_id: 0, - }, - Arc::new( - file_system.open(&p, Permission::ReadOnly).unwrap(), - ), - ))); - } - } - } - Ok(()) - }) - .unwrap(); + let mut files = HedgedFileSystem::get_files(&path).unwrap(); // TODO: handle error + files.append_files = files + .append_files + .into_iter() + .map(|f| f.into_handle(file_system)) + .collect(); + // exclude rewrite files, as they are always synced + files.reserved_files = files + .reserved_files + .into_iter() + .map(|f| f.into_handle(file_system)) + .collect(); Ok(TaskRes::Snapshot(files)) } Task::Stop | Task::Pause => unreachable!(), @@ -232,6 +197,14 @@ impl SeqFile { } } } + + fn into_handle(mut self, file_system: &DefaultFileSystem) -> Self { + if let SeqFile::Path(f) = self { + let fd = Arc::new(file_system.open(&f.path, Permission::ReadOnly).unwrap()); + self = SeqFile::Handle((f, fd)); + } + self + } } fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { @@ -598,15 +571,13 @@ impl HedgedFileSystem { Ok(()) } - fn get_files(&self, path: &PathBuf) -> IoResult { + fn get_files(path: &PathBuf) -> IoResult { + assert!(path.exists()); + let mut files = Files { prefix: path.clone(), ..Default::default() }; - if !path.exists() { - info!("Create raft log directory: {}", path.display()); - fs::create_dir(path).unwrap(); - } fs::read_dir(path) .unwrap() @@ -773,8 +744,16 @@ impl Drop for HedgedFileSystem { impl RecoverExt for HedgedFileSystem { fn bootstrap(&self) -> IoResult<()> { // catch up diff - let files1 = self.get_files(&self.path1)?; - let files2 = self.get_files(&self.path2)?; + if !self.path1.exists() { + info!("Create raft log directory: {}", self.path1.display()); + fs::create_dir(&self.path1).unwrap(); + } + if !self.path2.exists() { + info!("Create raft log directory: {}", self.path2.display()); + fs::create_dir(&self.path2).unwrap(); + } + let files1 = HedgedFileSystem::get_files(&self.path1)?; + let files2 = HedgedFileSystem::get_files(&self.path2)?; let count1 = self.get_latest_valid_seq(&files1)?; let count2 = self.get_latest_valid_seq(&files2)?; From fb31a9b93f263cd031886e41e39ab759eb35b2c9 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 12 Sep 2023 17:53:11 +0800 Subject: [PATCH 22/32] reimpl recover Signed-off-by: Connor1996 --- src/env/double_write.rs | 224 +++++++++++++++++----------------------- 1 file changed, 96 insertions(+), 128 deletions(-) diff --git a/src/env/double_write.rs b/src/env/double_write.rs index 14a077d8..f3e2d339 100644 --- a/src/env/double_write.rs +++ b/src/env/double_write.rs @@ -23,7 +23,6 @@ use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; -use std::sync::RwLock; use std::thread; use std::thread::JoinHandle; @@ -71,12 +70,12 @@ enum Task { size: usize, }, Pause, - Snapshot(PathBuf), + Snapshot, Stop, } impl SeqTask { - fn process(self, file_system: &DefaultFileSystem) -> IoResult { + fn process(self, file_system: &DefaultFileSystem, path: &PathBuf) -> IoResult { match self.inner { Task::Create(path) => file_system.create(&path).map(|h| TaskRes::Create { fd: h, @@ -90,7 +89,7 @@ impl SeqTask { Task::Rename { src_path, dst_path } => file_system .rename(src_path, dst_path) .map(|_| TaskRes::Rename), - Task::Snapshot(path) => { + Task::Snapshot => { let mut files = HedgedFileSystem::get_files(&path).unwrap(); // TODO: 
handle error files.append_files = files .append_files @@ -103,7 +102,7 @@ impl SeqTask { .into_iter() .map(|f| f.into_handle(file_system)) .collect(); - Ok(TaskRes::Snapshot(files)) + Ok(TaskRes::Snapshot((self.seq, files))) } Task::Stop | Task::Pause => unreachable!(), _ => self.handle_process(), @@ -148,7 +147,7 @@ enum TaskRes { Sync, Write(usize), Allocate, - Snapshot(Files), + Snapshot((u64, Files)), Stop, } @@ -228,29 +227,24 @@ struct HedgedSenderInner { disk1: Sender<(SeqTask, Callback)>, disk2: Sender<(SeqTask, Callback)>, seq: u64, - state: Arc>, + state: RecoveryState, } impl HedgedSender { fn new( disk1: Sender<(SeqTask, Callback)>, disk2: Sender<(SeqTask, Callback)>, - state: Arc>, ) -> Self { Self(Arc::new(Mutex::new(HedgedSenderInner { disk1, disk2, seq: 0, - state, + state: RecoveryState::Normal, }))) } - fn state(&self) -> Arc> { - self.0.lock().unwrap().state.clone() - } - fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { - if matches!(task1, Task::Pause | Task::Snapshot(_)) { + if matches!(task1, Task::Pause | Task::Snapshot) { unreachable!(); } @@ -264,8 +258,7 @@ impl HedgedSender { inner: task2, seq: inner.seq, }; - let state = inner.state.read().unwrap(); - if matches!(*state, RecoveryState::Normal) { + if matches!(inner.state, RecoveryState::Normal) { let check1 = inner.disk1.len() > ABORT_THRESHOLD; let check2 = inner.disk2.len() > ABORT_THRESHOLD; match (check1, check2) { @@ -273,7 +266,7 @@ impl HedgedSender { panic!("Both channels of disk1 and disk2 are full") } (true, false) => { - *inner.state.write().unwrap() = RecoveryState::Paused1; + inner.state = RecoveryState::Paused1; inner .disk1 .send(( @@ -286,7 +279,7 @@ impl HedgedSender { .unwrap(); } (false, true) => { - *inner.state.write().unwrap() = RecoveryState::Paused2; + inner.state = RecoveryState::Paused2; inner .disk2 .send(( @@ -301,47 +294,36 @@ impl HedgedSender { _ => {} } } - if !matches!( - *state, - RecoveryState::Paused1 | RecoveryState::WaitRecover1(_) - ) { + if !matches!(inner.state, RecoveryState::Paused1) { inner.disk1.send((task1, cb1)).unwrap(); } - if !matches!( - *state, - RecoveryState::Paused2 | RecoveryState::WaitRecover2(_) - ) { + if !matches!(inner.state, RecoveryState::Paused2) { inner.disk2.send((task2, cb2)).unwrap(); } } - fn send_snapshot(&self, index: u8, task: Task, cb: Callback) { - assert!(matches!(task, Task::Snapshot(_))); - - let inner = self.0.lock().unwrap(); - if index == 1 { - inner - .disk1 - .send(( - SeqTask { - inner: task, - seq: 0, - }, - cb, - )) - .unwrap(); - } else { - inner - .disk2 - .send(( - SeqTask { - inner: task, - seq: 0, - }, - cb, - )) - .unwrap(); + fn send_snapshot(&self, cb: Callback) { + let mut inner = self.0.lock().unwrap(); + inner.seq += 1; + let task = SeqTask { + inner: Task::Snapshot, + seq: inner.seq, + }; + match inner.state { + RecoveryState::Paused1 => { + inner.disk1.send((task, cb)).unwrap(); + } + RecoveryState::Paused2 => { + inner.disk2.send((task, cb)).unwrap(); + } + _ => unreachable!(), } + inner.state = RecoveryState::Recovering; + } + + fn finish_snapshot(&self) { + let mut inner = self.0.lock().unwrap(); + inner.state = RecoveryState::Normal; } } @@ -351,8 +333,6 @@ enum RecoveryState { * `Pause` task is sent and no more later task will be sent * to disk1 */ Paused2, // no more task will be sent to disk2 - WaitRecover1(oneshot::Sender<()>), - WaitRecover2(oneshot::Sender<()>), Recovering, } @@ -369,8 +349,6 @@ pub struct HedgedFileSystem { thread1: Option>, thread2: Option>, - - state: Arc>, } 
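
Taken together, the sender states form a small recovery machine: a full channel pauses the slow disk, the healthy disk answers a snapshot request, the diff is replayed onto the slow disk, and normal fan-out resumes. A simplified sketch of those transitions (the names and transition functions here are illustrative, not the crate's API):

    // Simplified stand-in types; the real code tracks Paused1/Paused2 plus the
    // Recovering phase inside HedgedSender.
    #[derive(Debug, PartialEq)]
    enum State {
        Normal,
        Paused { slow_disk: u8 },
        Recovering,
    }

    // A channel backlog beyond the threshold pauses the slow disk.
    fn on_queue_full(state: &mut State, slow_disk: u8) {
        assert_eq!(*state, State::Normal);
        *state = State::Paused { slow_disk };
    }

    // The healthy disk answered the Snapshot task.
    fn on_snapshot_taken(state: &mut State) {
        assert!(matches!(*state, State::Paused { .. }));
        *state = State::Recovering;
    }

    // Replaying the snapshot diff onto the slow disk finished.
    fn on_catch_up_done(state: &mut State) {
        assert_eq!(*state, State::Recovering);
        *state = State::Normal;
    }

    fn main() {
        let mut s = State::Normal;
        on_queue_full(&mut s, 1);   // disk1's channel exceeded the threshold
        on_snapshot_taken(&mut s);  // disk2 produced a snapshot of its files
        on_catch_up_done(&mut s);   // disk1 caught up, fan-out resumes
        assert_eq!(s, State::Normal);
    }
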
struct TaskRunner { @@ -378,8 +356,8 @@ struct TaskRunner { path: PathBuf, fs: Arc, rx: Receiver<(SeqTask, Callback)>, + sender: HedgedSender, seqno: Arc, - state: Arc>, } impl TaskRunner { @@ -388,16 +366,16 @@ impl TaskRunner { path: PathBuf, fs: Arc, rx: Receiver<(SeqTask, Callback)>, + sender: HedgedSender, seqno: Arc, - state: Arc>, ) -> Self { Self { id, path, fs, rx, + sender, seqno, - state, } } @@ -405,35 +383,66 @@ impl TaskRunner { thread::Builder::new() .name(format!("raft-engine-disk{}", self.id)) .spawn(move || { + let mut last_seq = 0; + let mut snap_seq = None; for (task, cb) in self.rx { if let Task::Stop = task.inner { cb(Ok(TaskRes::Stop)); break; } if let Task::Pause = task.inner { - let (tx, rx) = oneshot::channel(); - *self.state.write().unwrap() = if self.id == 1 { - RecoveryState::WaitRecover1(tx) - } else { - RecoveryState::WaitRecover2(tx) - }; - let _ = block_on(rx); - // indicate the pause is done - // do not update seqno for pause task + // Encountering `Pause`, indicate the disk may not slow anymore + let (cb, f) = paired_future_callback(); + self.sender.send_snapshot(cb); + let to_files = HedgedFileSystem::get_files(&self.path).unwrap(); // TODO: handle error + let from_files = block_on(f) + .unwrap() + .map(|res| { + if let TaskRes::Snapshot((seq, files)) = res { + snap_seq = Some(seq); + files + } else { + unreachable!() + } + }) + .unwrap(); // TODO: handle error + + // Snapshot doesn't include the file size, so it would copy more data than + // the data seen at the time of snapshot. But it's okay, as the data is + // written with specific offset, so the data written + // of no necessity will be overwritten by the latter writes. + // Exclude rewrite files because rewrite files are always synced. + HedgedFileSystem::catch_up_diff(&self.fs, from_files, to_files, true); + + self.sender.finish_snapshot(); + self.seqno.store(snap_seq.unwrap(), Ordering::Relaxed); + last_seq = snap_seq.unwrap(); continue; } if self.id == 1 { fail_point!("double_write::thread1"); } let seq = task.seq; - let res = task.process(&self.fs); + assert_ne!(seq, 0); + if let Some(snap) = snap_seq.as_ref() { + // the change already included in the snapshot + if seq + 1 < *snap { + } else if seq + 1 == *snap { + snap_seq = None; + } else { + panic!("seqno {} is larger than snapshot seqno {}", seq, *snap); + } + continue; + } + + assert_eq!(last_seq + 1, seq); + last_seq = seq; + let res = task.process(&self.fs, &self.path); // seqno should be updated before the write callback is called, otherwise one // read may be performed right after the write is finished. Then the read may be // performed on the other disk not having the data because the seqno for this // disk is not updated yet. 
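
After a snapshot is applied, queued tasks whose seqno is already covered by it must be skipped, and processing resumes at the first seqno past it. A sketch of that bookkeeping, assuming `snap_seq` records the seqno the snapshot was taken at (the exact comparison is adjusted again in the later "fix snapshot" commit of this series):

    // `snap_seq` is the seqno at which the snapshot was taken; anything below it
    // is already reflected in the copied files.
    fn should_apply(seq: u64, snap_seq: &mut Option<u64>) -> bool {
        match *snap_seq {
            None => true,
            // Change already captured by the snapshot: skip it.
            Some(snap) if seq < snap => false,
            // The snapshot task itself consumed this seqno.
            Some(snap) if seq == snap => unreachable!(),
            // First task issued after the snapshot: resume normal processing.
            Some(snap) if seq == snap + 1 => {
                *snap_seq = None;
                true
            }
            Some(snap) => panic!("gap between snapshot seq {} and task seq {}", snap, seq),
        }
    }

    fn main() {
        let mut snap = Some(5);
        assert!(!should_apply(3, &mut snap)); // skipped: already in the snapshot
        assert!(should_apply(6, &mut snap));  // applied: first post-snapshot task
        assert!(snap.is_none());
        assert!(should_apply(7, &mut snap));  // back to normal processing
    }
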
- if seq != 0 { - self.seqno.store(seq, Ordering::Relaxed); - } + self.seqno.store(seq, Ordering::Relaxed); cb(res); } }) @@ -447,7 +456,8 @@ impl HedgedFileSystem { pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); - let state = Arc::new(RwLock::new(RecoveryState::Normal)); + let sender = HedgedSender::new(tx1, tx2); + let seqno1 = Arc::new(AtomicU64::new(0)); let seqno2 = Arc::new(AtomicU64::new(0)); let runner1 = TaskRunner::new( @@ -455,20 +465,19 @@ impl HedgedFileSystem { path1.clone(), base.clone(), rx1, + sender.clone(), seqno1.clone(), - state.clone(), ); let runner2 = TaskRunner::new( 2, path2.clone(), base.clone(), rx2, + sender.clone(), seqno2.clone(), - state.clone(), ); let thread1 = runner1.spawn(); let thread2 = runner2.spawn(); - let sender = HedgedSender::new(tx1, tx2, state.clone()); Self { base, path1, @@ -478,12 +487,11 @@ impl HedgedFileSystem { seqno2, thread1: Some(thread1), thread2: Some(thread2), - state, } } fn catch_up_diff( - &self, + fs: &Arc, mut from_files: Files, mut to_files: Files, skip_rewrite: bool, @@ -519,7 +527,7 @@ impl HedgedFileSystem { from_files.prefix.as_ref(), to_files.prefix.as_ref(), ); - f1.copy(&self.base, &to)?; + f1.copy(fs, &to)?; iter1.next(); } (None, Some(f2)) => { @@ -538,7 +546,7 @@ impl HedgedFileSystem { from_files.prefix.as_ref(), to_files.prefix.as_ref(), ); - f1.copy(&self.base, &to)?; + f1.copy(fs, &to)?; } iter1.next(); iter2.next(); @@ -549,7 +557,7 @@ impl HedgedFileSystem { from_files.prefix.as_ref(), to_files.prefix.as_ref(), ); - f1.copy(&self.base, &to)?; + f1.copy(fs, &to)?; iter1.next(); } std::cmp::Ordering::Greater => { @@ -761,68 +769,28 @@ impl RecoverExt for HedgedFileSystem { match count1.cmp(&count2) { std::cmp::Ordering::Equal => { // still need to catch up, but only diff - self.catch_up_diff(files1, files2, false)?; + HedgedFileSystem::catch_up_diff(&self.base, files1, files2, false)?; return Ok(()); } std::cmp::Ordering::Less => { - self.catch_up_diff(files2, files1, false)?; + HedgedFileSystem::catch_up_diff(&self.base, files2, files1, false)?; } std::cmp::Ordering::Greater => { - self.catch_up_diff(files1, files2, false)?; + HedgedFileSystem::catch_up_diff(&self.base, files1, files2, false)?; } } Ok(()) } fn need_recover(&self) -> bool { - // in wait recover state, the task should be still dropped - let res = matches!( - *self.state.read().unwrap(), - RecoveryState::WaitRecover1(_) | RecoveryState::WaitRecover2(_) - ); - if res { - // in recovering stat, the task can keep sending - *self.state.write().unwrap() = RecoveryState::Recovering; - } - res + false } fn is_in_recover(&self) -> bool { - matches!(*self.state.read().unwrap(), RecoveryState::Recovering) + false } - fn trigger_recover(&self) { - // let (cb, f) = paired_future_callback(); - // let to_files = match *self.state.read().unwrap() { - // RecoveryState::WaitRecover1(tx) => { - // self.sender - // .send_snapshot(1, Task::Snapshot(self.path2.clone()), cb); - // self.get_files(&self.path1).unwrap() // TODO: handle error - // } - // RecoveryState::WaitRecover2(tx) => { - // self.sender - // .send_snapshot(2, Task::Snapshot(self.path1.clone()), cb); - // self.get_files(&self.path2).unwrap() - // } - // _ => unreachable!(), - // }; - - // let from_files = block_on(f).unwrap().map(|res| { - // if let TaskRes::Snapshot(files) = res { - // files - // } else { - // unreachable!() - // } - // }).unwrap(); // TODO: handle error - - // 
TODO: async - // exclude rewrite files because rewrite files are always synced - // self.catch_up_diff(from_files, to_files, true); - - // when - *self.state.write().unwrap() = RecoveryState::Normal; - // tx.send(()).unwrap(); - } + fn trigger_recover(&self) {} } impl FileSystem for HedgedFileSystem { @@ -1017,7 +985,7 @@ impl HedgedHandle { cb(res); } })); - sender = HedgedSender::new(tx1, tx2, sender.state()); + sender = HedgedSender::new(tx1, tx2); } Self { From 5a955f89a8b06e211c86805edb4b77e6bdf30e4c Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 12 Sep 2023 18:11:20 +0800 Subject: [PATCH 23/32] rename Signed-off-by: Connor1996 --- src/env/{double_write.rs => hedged.rs} | 6 ++++++ src/env/mod.rs | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) rename src/env/{double_write.rs => hedged.rs} (99%) diff --git a/src/env/double_write.rs b/src/env/hedged.rs similarity index 99% rename from src/env/double_write.rs rename to src/env/hedged.rs index f3e2d339..c8b63734 100644 --- a/src/env/double_write.rs +++ b/src/env/hedged.rs @@ -37,6 +37,12 @@ use either::Either; type Callback = Box) + Send>; +// TODO: handle error and abrupt +// TODO: add metrics +// TODO: handle specially on config change(upgrade and downgrade) +// TODO: remove recover ext +// TODO: add comment and rename + struct SeqTask { inner: Task, seq: u64, diff --git a/src/env/mod.rs b/src/env/mod.rs index d618cedb..835c0981 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -5,11 +5,11 @@ use std::path::Path; use std::sync::Arc; mod default; -mod double_write; +mod hedged; mod obfuscated; pub use default::DefaultFileSystem; -pub use double_write::HedgedFileSystem; +pub use hedged::HedgedFileSystem; pub use obfuscated::ObfuscatedFileSystem; #[derive(Clone, Copy, PartialEq, Eq, Debug)] From cef9add728e503000e28c9c590e7664ba1feb8e5 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 12 Sep 2023 19:54:34 +0800 Subject: [PATCH 24/32] fix snapshot Signed-off-by: Connor1996 --- src/env/hedged.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/env/hedged.rs b/src/env/hedged.rs index c8b63734..e839430d 100644 --- a/src/env/hedged.rs +++ b/src/env/hedged.rs @@ -317,10 +317,10 @@ impl HedgedSender { }; match inner.state { RecoveryState::Paused1 => { - inner.disk1.send((task, cb)).unwrap(); + inner.disk2.send((task, cb)).unwrap(); } RecoveryState::Paused2 => { - inner.disk2.send((task, cb)).unwrap(); + inner.disk1.send((task, cb)).unwrap(); } _ => unreachable!(), } @@ -432,13 +432,15 @@ impl TaskRunner { assert_ne!(seq, 0); if let Some(snap) = snap_seq.as_ref() { // the change already included in the snapshot - if seq + 1 < *snap { - } else if seq + 1 == *snap { + if seq < *snap { + continue; + } else if seq == *snap { + unreachable!(); + } else if seq == *snap + 1 { snap_seq = None; } else { panic!("seqno {} is larger than snapshot seqno {}", seq, *snap); } - continue; } assert_eq!(last_seq + 1, seq); From 135ae19f163188d608941bbfc56d293a7efbbfac Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Wed, 13 Sep 2023 17:05:46 +0800 Subject: [PATCH 25/32] add recover case and fix cancealed rx Signed-off-by: Connor1996 --- src/env/hedged.rs | 124 ++++++++++++++++++++++---------- src/env/mod.rs | 2 + tests/failpoints/test_engine.rs | 59 ++++++++++++++- 3 files changed, 146 insertions(+), 39 deletions(-) diff --git a/src/env/hedged.rs b/src/env/hedged.rs index e839430d..80304a3f 100644 --- a/src/env/hedged.rs +++ b/src/env/hedged.rs @@ -29,7 +29,7 @@ use std::thread::JoinHandle; use 
crate::env::default::LogFd; use crate::env::DefaultFileSystem; use crate::env::{FileSystem, Handle, Permission, WriteExt}; -use futures::channel::oneshot; +use futures::channel::oneshot::{self, Canceled}; use futures::executor::block_on; use futures::{join, select}; @@ -48,6 +48,7 @@ struct SeqTask { seq: u64, } +#[derive(Clone)] enum Task { Create(PathBuf), Open { @@ -111,23 +112,27 @@ impl SeqTask { Ok(TaskRes::Snapshot((self.seq, files))) } Task::Stop | Task::Pause => unreachable!(), - _ => self.handle_process(), + _ => self.handle_process(file_system), } } - fn handle_process(self) -> IoResult { + fn handle_process(self, file_system: &DefaultFileSystem) -> IoResult { match self.inner { - Task::Truncate { handle, offset } => { - handle.get().truncate(offset).map(|_| TaskRes::Truncate) - } - Task::FileSize(handle) => handle.get().file_size().map(|s| TaskRes::FileSize(s)), - Task::Sync(handle) => handle.get().sync().map(|_| TaskRes::Sync), + Task::Truncate { handle, offset } => handle + .get(file_system) + .truncate(offset) + .map(|_| TaskRes::Truncate), + Task::FileSize(handle) => handle + .get(file_system) + .file_size() + .map(|s| TaskRes::FileSize(s)), + Task::Sync(handle) => handle.get(file_system).sync().map(|_| TaskRes::Sync), Task::Write { handle, offset, bytes, } => handle - .get() + .get(file_system) .write(offset, &bytes) .map(|s| TaskRes::Write(s)), Task::Allocate { @@ -135,7 +140,7 @@ impl SeqTask { offset, size, } => handle - .get() + .get(file_system) .allocate(offset, size) .map(|_| TaskRes::Allocate), _ => unreachable!(), @@ -222,7 +227,15 @@ fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { // let say the average entry size is 100B, then the total size of the log in the // channel is 1GB, -const ABORT_THRESHOLD: usize = 10000; +const PAUSE_THRESHOLD: usize = 10000; + +fn get_pause_threshold() -> usize { + fail_point!("hedged::pause_threshold", |s| s + .unwrap() + .parse::() + .unwrap()); + PAUSE_THRESHOLD +} // Make sure the task is sent to two disks' channel atomically, otherwise the // ordering of the tasks in two disks' channels are not same. 
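
The comment above is the core ordering invariant behind `HedgedSender`: the sequence number must be assigned and the task handed to both disks' channels while a single lock is held, so that both `TaskRunner`s observe exactly the same sequence of tasks. As a minimal sketch of that idea (hypothetical names; `PairedSender` is not a type in this patch):

    use std::sync::Mutex;

    use crossbeam::channel::Sender;

    // Both sends happen under one mutex, so the two per-disk queues always
    // hold the same tasks, tagged with the same sequence numbers, in the
    // same order.
    struct PairedSender<T> {
        inner: Mutex<PairedSenderInner<T>>,
    }

    struct PairedSenderInner<T> {
        seq: u64,
        disk1: Sender<(u64, T)>,
        disk2: Sender<(u64, T)>,
    }

    impl<T: Clone> PairedSender<T> {
        fn send(&self, task: T) {
            // Without the lock spanning both sends, two concurrent callers
            // could interleave differently on the two channels and break the
            // ordering invariant.
            let mut inner = self.inner.lock().unwrap();
            inner.seq += 1;
            let seq = inner.seq;
            inner.disk1.send((seq, task.clone())).unwrap();
            inner.disk2.send((seq, task)).unwrap();
        }
    }

With this invariant in place, each `TaskRunner` can check `assert_eq!(last_seq + 1, seq)` on its own channel and detect any lost or reordered task.
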
@@ -233,7 +246,7 @@ struct HedgedSenderInner { disk1: Sender<(SeqTask, Callback)>, disk2: Sender<(SeqTask, Callback)>, seq: u64, - state: RecoveryState, + state: State, } impl HedgedSender { @@ -245,10 +258,14 @@ impl HedgedSender { disk1, disk2, seq: 0, - state: RecoveryState::Normal, + state: State::Normal, }))) } + fn state(&self) -> State { + self.0.lock().unwrap().state.clone() + } + fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { if matches!(task1, Task::Pause | Task::Snapshot) { unreachable!(); @@ -264,15 +281,15 @@ impl HedgedSender { inner: task2, seq: inner.seq, }; - if matches!(inner.state, RecoveryState::Normal) { - let check1 = inner.disk1.len() > ABORT_THRESHOLD; - let check2 = inner.disk2.len() > ABORT_THRESHOLD; + if matches!(inner.state, State::Normal) { + let check1 = inner.disk1.len() > get_pause_threshold(); + let check2 = inner.disk2.len() > get_pause_threshold(); match (check1, check2) { (true, true) => { panic!("Both channels of disk1 and disk2 are full") } (true, false) => { - inner.state = RecoveryState::Paused1; + inner.state = State::Paused1; inner .disk1 .send(( @@ -285,7 +302,7 @@ impl HedgedSender { .unwrap(); } (false, true) => { - inner.state = RecoveryState::Paused2; + inner.state = State::Paused2; inner .disk2 .send(( @@ -300,10 +317,10 @@ impl HedgedSender { _ => {} } } - if !matches!(inner.state, RecoveryState::Paused1) { + if !matches!(inner.state, State::Paused1) { inner.disk1.send((task1, cb1)).unwrap(); } - if !matches!(inner.state, RecoveryState::Paused2) { + if !matches!(inner.state, State::Paused2) { inner.disk2.send((task2, cb2)).unwrap(); } } @@ -316,24 +333,25 @@ impl HedgedSender { seq: inner.seq, }; match inner.state { - RecoveryState::Paused1 => { + State::Paused1 => { inner.disk2.send((task, cb)).unwrap(); } - RecoveryState::Paused2 => { + State::Paused2 => { inner.disk1.send((task, cb)).unwrap(); } _ => unreachable!(), } - inner.state = RecoveryState::Recovering; + inner.state = State::Recovering; } fn finish_snapshot(&self) { let mut inner = self.0.lock().unwrap(); - inner.state = RecoveryState::Normal; + inner.state = State::Normal; } } -enum RecoveryState { +#[derive(Debug, PartialEq, Clone)] +pub enum State { Normal, Paused1, /* When the length of channel of disk1 reaches threshold, a * `Pause` task is sent and no more later task will be sent @@ -426,7 +444,7 @@ impl TaskRunner { continue; } if self.id == 1 { - fail_point!("double_write::thread1"); + fail_point!("hedged::task_runner::thread1"); } let seq = task.seq; assert_ne!(seq, 0); @@ -498,6 +516,10 @@ impl HedgedFileSystem { } } + pub fn state(&self) -> State { + self.sender.state() + } + fn catch_up_diff( fs: &Arc, mut from_files: Files, @@ -677,7 +699,7 @@ impl HedgedFileSystem { async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { let (cb1, mut f1) = paired_future_callback(); let (cb2, mut f2) = paired_future_callback(); - self.sender.send(task1, task2, cb1, cb2); + self.sender.send(task1.clone(), task2.clone(), cb1, cb2); let resolve = |res: TaskRes| -> (LogFd, bool) { match res { @@ -690,10 +712,11 @@ impl HedgedFileSystem { res1 = f1 => res1.unwrap().map(|res| { let (fd, is_for_rewrite) = resolve(res); HedgedHandle::new( + self.base.clone(), is_for_rewrite, self.sender.clone(), FutureHandle::new_owned(fd), - FutureHandle::new(f2), + FutureHandle::new(f2, task2), self.seqno1.clone(), self.seqno2.clone(), ) @@ -701,9 +724,10 @@ impl HedgedFileSystem { res2 = f2 => res2.unwrap().map(|res| { let (fd, is_for_rewrite) = resolve(res); 
HedgedHandle::new( + self.base.clone(), is_for_rewrite, self.sender.clone(), - FutureHandle::new(f1), + FutureHandle::new(f1, task1), FutureHandle::new_owned(fd) , self.seqno1.clone(), self.seqno2.clone(), @@ -865,6 +889,7 @@ impl FileSystem for HedgedFileSystem { pub struct FutureHandle { inner: UnsafeCell>, Arc>>, + task: Option, } unsafe impl Send for FutureHandle {} @@ -878,27 +903,47 @@ unsafe impl Send for FutureHandle {} unsafe impl Sync for FutureHandle {} impl FutureHandle { - fn new(rx: oneshot::Receiver>) -> Self { + fn new(rx: oneshot::Receiver>, task: Task) -> Self { Self { inner: UnsafeCell::new(Either::Left(rx)), + task: Some(task), } } fn new_owned(h: LogFd) -> Self { Self { inner: UnsafeCell::new(Either::Right(Arc::new(h))), + task: None, } } - fn get(&self) -> Arc { + fn get(&self, file_system: &DefaultFileSystem) -> Arc { let mut set = false; let fd = match unsafe { &mut *self.inner.get() } { Either::Left(rx) => { set = true; // TODO: should we handle the second disk io error - match block_on(rx).unwrap().unwrap() { - TaskRes::Open { fd, .. } => Arc::new(fd), - TaskRes::Create { fd, .. } => Arc::new(fd), - _ => unreachable!(), + match block_on(rx) { + Err(Canceled) => { + // Canceled is caused by the task is dropped when in paused state, + // so we should retry the task now + match self.task.as_ref().unwrap() { + Task::Create(path) => { + // has been already created, so just open + let fd = file_system.open(path, Permission::ReadWrite).unwrap(); // TODO: handle error + Arc::new(fd) + } + Task::Open { path, perm } => { + let fd = file_system.open(path, *perm).unwrap(); // TODO: handle error + Arc::new(fd) + } + _ => unreachable!(), + } + } + Ok(res) => match res.unwrap() { + TaskRes::Open { fd, .. } => Arc::new(fd), + TaskRes::Create { fd, .. 
} => Arc::new(fd), + _ => unreachable!(), + }, } } Either::Right(w) => w.clone(), @@ -953,6 +998,7 @@ pub struct HedgedHandle { impl HedgedHandle { fn new( + base: Arc, strong_consistent: bool, mut sender: HedgedSender, handle1: FutureHandle, @@ -972,23 +1018,27 @@ impl HedgedHandle { seqno1 = Arc::new(AtomicU64::new(0)); seqno2 = Arc::new(AtomicU64::new(0)); let seqno1_clone = seqno1.clone(); + let fs1 = base.clone(); thread1 = Some(thread::spawn(move || { for (task, cb) in rx1 { if let Task::Stop = task.inner { break; } - let res = task.handle_process(); + assert!(!matches!(task.inner, Task::Pause | Task::Snapshot)); + let res = task.handle_process(&fs1); seqno1_clone.fetch_add(1, Ordering::Relaxed); cb(res); } })); let seqno2_clone = seqno2.clone(); + let fs2 = base; thread2 = Some(thread::spawn(move || { for (task, cb) in rx2 { if let Task::Stop = task.inner { break; } - let res = task.handle_process(); + assert!(!matches!(task.inner, Task::Pause | Task::Snapshot)); + let res = task.handle_process(&fs2); seqno2_clone.fetch_add(1, Ordering::Relaxed); cb(res); } diff --git a/src/env/mod.rs b/src/env/mod.rs index 835c0981..c7c76ac0 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -12,6 +12,8 @@ pub use default::DefaultFileSystem; pub use hedged::HedgedFileSystem; pub use obfuscated::ObfuscatedFileSystem; +pub use hedged::State; + #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum Permission { ReadOnly, diff --git a/tests/failpoints/test_engine.rs b/tests/failpoints/test_engine.rs index 58b8d5ff..ecb925c2 100644 --- a/tests/failpoints/test_engine.rs +++ b/tests/failpoints/test_engine.rs @@ -1206,6 +1206,61 @@ fn number_of_files(p: &Path) -> usize { r } +#[test] +fn test_start_engine_with_slow_second_disk_recover() { + let dir = tempfile::Builder::new() + .prefix("test_start_engine_with_slow_second_disk_default") + .tempdir() + .unwrap(); + let sec_dir = tempfile::Builder::new() + .prefix("test_start_engine_with_slow_second_disk_second") + .tempdir() + .unwrap(); + + let file_system = Arc::new(HedgedFileSystem::new( + Arc::new(DefaultFileSystem {}), + dir.path().to_path_buf(), + sec_dir.path().to_path_buf(), + )); + let entry_data = vec![b'x'; 512]; + + // Preparations for multi-dirs. + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + enable_log_recycle: false, + target_file_size: ReadableSize(1), + ..Default::default() + }; + + fail::cfg("hedged::pause_threshold", "return(10)").unwrap(); + // Step 1: write data into the main directory. 
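    // From here the test pauses disk1's task runner ("hedged::task_runner::thread1"),
    // so disk1's channel backs up past the lowered threshold of 10 and the sender
    // moves to the Paused1 state, serving writes from disk2 alone. Removing the
    // pause lets disk1 drain its queue, request a snapshot of disk2, catch up from
    // it, and return the file system to State::Normal, which is asserted below.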
+ let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + fail::cfg("hedged::task_runner::thread1", "pause").unwrap(); + for rid in 1..=20 { + append(&engine, rid, 1, 10, Some(&entry_data)); + } + for rid in 1..=20 { + assert_eq!(engine.first_index(rid).unwrap(), 1); + } + assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); + fail::remove("hedged::task_runner::thread1"); + let mut times = 0; + loop { + if number_of_files(sec_dir.path()) == number_of_files(dir.path()) { + break; + } + if times > 50 { + panic!("rewrite queue is not finished"); + } + times += 1; + std::thread::sleep(Duration::from_millis(10)); + } + assert_eq!(file_system.state(), env::State::Normal); + drop(file_system); + drop(engine); + assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); +} + #[test] fn test_start_engine_with_slow_second_disk() { let dir = tempfile::Builder::new() @@ -1234,7 +1289,7 @@ fn test_start_engine_with_slow_second_disk() { // Step 1: write data into the main directory. let engine = Engine::open_with_file_system(cfg.clone(), file_system).unwrap(); - fail::cfg("double_write::thread1", "pause").unwrap(); + fail::cfg("hedged::task_runner::thread1", "pause").unwrap(); for rid in 1..=10 { append(&engine, rid, 1, 10, Some(&entry_data)); } @@ -1242,7 +1297,7 @@ fn test_start_engine_with_slow_second_disk() { assert_eq!(engine.first_index(rid).unwrap(), 1); } assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); - fail::remove("double_write::thread1"); + fail::remove("hedged::task_runner::thread1"); drop(engine); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); From df907f06c2753418b2170a53d334c45aa64c721f Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 14 Sep 2023 18:27:02 +0800 Subject: [PATCH 26/32] handle background error and panic Signed-off-by: Connor1996 --- src/env/hedged.rs | 318 ++++++++++++++++++++++++---------------------- 1 file changed, 165 insertions(+), 153 deletions(-) diff --git a/src/env/hedged.rs b/src/env/hedged.rs index 80304a3f..aac23c59 100644 --- a/src/env/hedged.rs +++ b/src/env/hedged.rs @@ -13,7 +13,7 @@ use crate::Error; use crossbeam::channel::unbounded; use crossbeam::channel::{Receiver, Sender}; use fail::fail_point; -use log::{info, warn}; +use log::info; use std::cell::UnsafeCell; use std::fs; use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; @@ -35,9 +35,23 @@ use futures::{join, select}; use either::Either; -type Callback = Box) + Send>; +type Callback = Box) -> IoResult + Send>; + +fn empty_callback() -> Callback { + Box::new(|_| Ok(TaskRes::Noop)) +} + +fn paired_future_callback() -> (Callback, oneshot::Receiver>) { + let (tx, future) = oneshot::channel(); + let callback = Box::new(move |result| -> IoResult { + if let Err(result) = tx.send(result) { + return result; + } + Ok(TaskRes::Noop) + }); + (callback, future) +} -// TODO: handle error and abrupt // TODO: add metrics // TODO: handle specially on config change(upgrade and downgrade) // TODO: remove recover ext @@ -97,7 +111,7 @@ impl SeqTask { .rename(src_path, dst_path) .map(|_| TaskRes::Rename), Task::Snapshot => { - let mut files = HedgedFileSystem::get_files(&path).unwrap(); // TODO: handle error + let mut files = HedgedFileSystem::get_files(&path)?; files.append_files = files .append_files .into_iter() @@ -119,20 +133,20 @@ impl SeqTask { fn handle_process(self, file_system: &DefaultFileSystem) -> IoResult { match self.inner { Task::Truncate { handle, offset } => handle - 
.get(file_system) + .get(file_system)? .truncate(offset) .map(|_| TaskRes::Truncate), Task::FileSize(handle) => handle - .get(file_system) + .get(file_system)? .file_size() .map(|s| TaskRes::FileSize(s)), - Task::Sync(handle) => handle.get(file_system).sync().map(|_| TaskRes::Sync), + Task::Sync(handle) => handle.get(file_system)?.sync().map(|_| TaskRes::Sync), Task::Write { handle, offset, bytes, } => handle - .get(file_system) + .get(file_system)? .write(offset, &bytes) .map(|s| TaskRes::Write(s)), Task::Allocate { @@ -140,7 +154,7 @@ impl SeqTask { offset, size, } => handle - .get(file_system) + .get(file_system)? .allocate(offset, size) .map(|_| TaskRes::Allocate), _ => unreachable!(), @@ -149,6 +163,7 @@ impl SeqTask { } enum TaskRes { + Noop, Create { fd: LogFd, is_for_rewrite: bool }, Open { fd: LogFd, is_for_rewrite: bool }, Delete, @@ -243,17 +258,14 @@ fn get_pause_threshold() -> usize { struct HedgedSender(Arc>); struct HedgedSenderInner { - disk1: Sender<(SeqTask, Callback)>, - disk2: Sender<(SeqTask, Callback)>, + disk1: Sender<(SeqTask, Callback)>, + disk2: Sender<(SeqTask, Callback)>, seq: u64, state: State, } impl HedgedSender { - fn new( - disk1: Sender<(SeqTask, Callback)>, - disk2: Sender<(SeqTask, Callback)>, - ) -> Self { + fn new(disk1: Sender<(SeqTask, Callback)>, disk2: Sender<(SeqTask, Callback)>) -> Self { Self(Arc::new(Mutex::new(HedgedSenderInner { disk1, disk2, @@ -266,7 +278,7 @@ impl HedgedSender { self.0.lock().unwrap().state.clone() } - fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { + fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { if matches!(task1, Task::Pause | Task::Snapshot) { unreachable!(); } @@ -297,7 +309,7 @@ impl HedgedSender { inner: Task::Pause, seq: 0, }, - Box::new(|_| {}), + empty_callback(), )) .unwrap(); } @@ -310,7 +322,7 @@ impl HedgedSender { inner: Task::Pause, seq: 0, }, - Box::new(|_| {}), + empty_callback(), )) .unwrap(); } @@ -325,7 +337,7 @@ impl HedgedSender { } } - fn send_snapshot(&self, cb: Callback) { + fn send_snapshot(&self, cb: Callback) { let mut inner = self.0.lock().unwrap(); inner.seq += 1; let task = SeqTask { @@ -379,7 +391,7 @@ struct TaskRunner { id: u8, path: PathBuf, fs: Arc, - rx: Receiver<(SeqTask, Callback)>, + rx: Receiver<(SeqTask, Callback)>, sender: HedgedSender, seqno: Arc, } @@ -389,7 +401,7 @@ impl TaskRunner { id: u8, path: PathBuf, fs: Arc, - rx: Receiver<(SeqTask, Callback)>, + rx: Receiver<(SeqTask, Callback)>, sender: HedgedSender, seqno: Arc, ) -> Self { @@ -404,84 +416,89 @@ impl TaskRunner { } fn spawn(self) -> JoinHandle<()> { + let id = self.id; thread::Builder::new() - .name(format!("raft-engine-disk{}", self.id)) + .name(format!("raft-engine-disk{}", id)) .spawn(move || { - let mut last_seq = 0; - let mut snap_seq = None; - for (task, cb) in self.rx { - if let Task::Stop = task.inner { - cb(Ok(TaskRes::Stop)); - break; - } - if let Task::Pause = task.inner { - // Encountering `Pause`, indicate the disk may not slow anymore - let (cb, f) = paired_future_callback(); - self.sender.send_snapshot(cb); - let to_files = HedgedFileSystem::get_files(&self.path).unwrap(); // TODO: handle error - let from_files = block_on(f) - .unwrap() - .map(|res| { - if let TaskRes::Snapshot((seq, files)) = res { - snap_seq = Some(seq); - files - } else { - unreachable!() - } - }) - .unwrap(); // TODO: handle error - - // Snapshot doesn't include the file size, so it would copy more data than - // the data seen at the time of snapshot. 
But it's okay, as the data is - // written with specific offset, so the data written - // of no necessity will be overwritten by the latter writes. - // Exclude rewrite files because rewrite files are always synced. - HedgedFileSystem::catch_up_diff(&self.fs, from_files, to_files, true); - - self.sender.finish_snapshot(); - self.seqno.store(snap_seq.unwrap(), Ordering::Relaxed); - last_seq = snap_seq.unwrap(); - continue; - } - if self.id == 1 { - fail_point!("hedged::task_runner::thread1"); - } - let seq = task.seq; - assert_ne!(seq, 0); - if let Some(snap) = snap_seq.as_ref() { - // the change already included in the snapshot - if seq < *snap { - continue; - } else if seq == *snap { - unreachable!(); - } else if seq == *snap + 1 { - snap_seq = None; - } else { - panic!("seqno {} is larger than snapshot seqno {}", seq, *snap); - } - } - - assert_eq!(last_seq + 1, seq); - last_seq = seq; - let res = task.process(&self.fs, &self.path); - // seqno should be updated before the write callback is called, otherwise one - // read may be performed right after the write is finished. Then the read may be - // performed on the other disk not having the data because the seqno for this - // disk is not updated yet. - self.seqno.store(seq, Ordering::Relaxed); - cb(res); + if let Err(e) = self.poll() { + panic!("disk {} failed: {:?}", id, e); } }) .unwrap() } + + fn poll(self) -> IoResult<()> { + let mut last_seq = 0; + let mut snap_seq = None; + for (task, cb) in self.rx { + if let Task::Stop = task.inner { + cb(Ok(TaskRes::Stop))?; + break; + } + if let Task::Pause = task.inner { + // Encountering `Pause`, indicate the disk may not slow anymore + let (cb, f) = paired_future_callback(); + self.sender.send_snapshot(cb); + let to_files = HedgedFileSystem::get_files(&self.path)?; + let from_files = block_on(f).unwrap().map(|res| { + if let TaskRes::Snapshot((seq, files)) = res { + snap_seq = Some(seq); + files + } else { + unreachable!() + } + })?; + + // Snapshot doesn't include the file size, so it would copy more data than + // the data seen at the time of snapshot. But it's okay, as the data is + // written with specific offset, so the data written + // of no necessity will be overwritten by the latter writes. + // Exclude rewrite files because rewrite files are always synced. + HedgedFileSystem::catch_up_diff(&self.fs, from_files, to_files, true)?; + + self.sender.finish_snapshot(); + self.seqno.store(snap_seq.unwrap(), Ordering::Relaxed); + last_seq = snap_seq.unwrap(); + continue; + } + if self.id == 1 { + fail_point!("hedged::task_runner::thread1"); + } + let seq = task.seq; + assert_ne!(seq, 0); + if let Some(snap) = snap_seq.as_ref() { + // the change already included in the snapshot + if seq < *snap { + continue; + } else if seq == *snap { + unreachable!(); + } else if seq == *snap + 1 { + snap_seq = None; + } else { + panic!("seqno {} is larger than snapshot seqno {}", seq, *snap); + } + } + + assert_eq!(last_seq + 1, seq); + last_seq = seq; + let res = task.process(&self.fs, &self.path); + // seqno should be updated before the write callback is called, otherwise one + // read may be performed right after the write is finished. Then the read may be + // performed on the other disk not having the data because the seqno for this + // disk is not updated yet. + self.seqno.store(seq, Ordering::Relaxed); + cb(res)?; + } + Ok(()) + } } // TODO: read both dir at recovery, maybe no need? 
cause operations are to both // disks TODO: consider encryption impl HedgedFileSystem { pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { - let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); - let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); + let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); + let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); let sender = HedgedSender::new(tx1, tx2); let seqno1 = Arc::new(AtomicU64::new(0)); @@ -916,30 +933,14 @@ impl FutureHandle { } } - fn get(&self, file_system: &DefaultFileSystem) -> Arc { + fn get(&self, file_system: &DefaultFileSystem) -> IoResult> { let mut set = false; let fd = match unsafe { &mut *self.inner.get() } { Either::Left(rx) => { set = true; - // TODO: should we handle the second disk io error match block_on(rx) { - Err(Canceled) => { - // Canceled is caused by the task is dropped when in paused state, - // so we should retry the task now - match self.task.as_ref().unwrap() { - Task::Create(path) => { - // has been already created, so just open - let fd = file_system.open(path, Permission::ReadWrite).unwrap(); // TODO: handle error - Arc::new(fd) - } - Task::Open { path, perm } => { - let fd = file_system.open(path, *perm).unwrap(); // TODO: handle error - Arc::new(fd) - } - _ => unreachable!(), - } - } - Ok(res) => match res.unwrap() { + Err(Canceled) => self.retry_canceled(file_system)?, + Ok(res) => match res? { TaskRes::Open { fd, .. } => Arc::new(fd), TaskRes::Create { fd, .. } => Arc::new(fd), _ => unreachable!(), @@ -953,21 +954,22 @@ impl FutureHandle { *self.inner.get() = Either::Right(fd.clone()); } } - fd + Ok(fd) } - fn try_get(&self) -> Option> { + fn try_get(&self, file_system: &DefaultFileSystem) -> IoResult>> { let mut set = false; let fd = match unsafe { &mut *self.inner.get() } { Either::Left(rx) => { set = true; - // TODO: should we handle the second disk io error - match rx.try_recv().unwrap() { - None => return None, - Some(Err(_)) => panic!(), - Some(Ok(TaskRes::Open { fd, .. })) => Arc::new(fd), - Some(Ok(TaskRes::Create { fd, .. })) => Arc::new(fd), - _ => unreachable!(), + match rx.try_recv() { + Err(Canceled) => self.retry_canceled(file_system)?, + Ok(None) => return Ok(None), + Ok(Some(res)) => match res? { + TaskRes::Open { fd, .. } => Arc::new(fd), + TaskRes::Create { fd, .. 
} => Arc::new(fd), + _ => unreachable!(), + }, } } Either::Right(w) => w.clone(), @@ -977,11 +979,31 @@ impl FutureHandle { *self.inner.get() = Either::Right(fd.clone()); } } - Some(fd) + Ok(Some(fd)) + } + + fn retry_canceled(&self, file_system: &DefaultFileSystem) -> IoResult> { + // Canceled is caused by the task is dropped when in paused state, + // so we should retry the task now + Ok(match self.task.as_ref().unwrap() { + Task::Create(path) => { + // has been already created, so just open + let fd = file_system.open(path, Permission::ReadWrite)?; + Arc::new(fd) + } + Task::Open { path, perm } => { + let fd = file_system.open(path, *perm)?; + Arc::new(fd) + } + _ => unreachable!(), + }) } } pub struct HedgedHandle { + base: Arc, + + // for rewrite file, all the operations should wait both disks finished strong_consistent: bool, sender: HedgedSender, @@ -1010,43 +1032,41 @@ impl HedgedHandle { let mut thread2 = None; if strong_consistent { // use two separated threads for both wait - let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); - let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); + let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); + let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); // replace the seqno with self owned, then in `read` the seqno from two disks // should be always the same. It's just to reuse the logic without // adding special check in `read` seqno1 = Arc::new(AtomicU64::new(0)); seqno2 = Arc::new(AtomicU64::new(0)); - let seqno1_clone = seqno1.clone(); - let fs1 = base.clone(); - thread1 = Some(thread::spawn(move || { - for (task, cb) in rx1 { + let poll = |(rx, seqno, fs): ( + Receiver<(SeqTask, Callback)>, + Arc, + Arc, + )| { + for (task, cb) in rx { if let Task::Stop = task.inner { break; } assert!(!matches!(task.inner, Task::Pause | Task::Snapshot)); - let res = task.handle_process(&fs1); - seqno1_clone.fetch_add(1, Ordering::Relaxed); - cb(res); + let res = task.handle_process(&fs); + seqno.fetch_add(1, Ordering::Relaxed); + cb(res).unwrap(); } + }; + let args1 = (rx1, seqno1.clone(), base.clone()); + thread1 = Some(thread::spawn(move || { + poll(args1); })); - let seqno2_clone = seqno2.clone(); - let fs2 = base; + let args2 = (rx2, seqno2.clone(), base.clone()); thread2 = Some(thread::spawn(move || { - for (task, cb) in rx2 { - if let Task::Stop = task.inner { - break; - } - assert!(!matches!(task.inner, Task::Pause | Task::Snapshot)); - let res = task.handle_process(&fs2); - seqno2_clone.fetch_add(1, Ordering::Relaxed); - cb(res); - } + poll(args2); })); sender = HedgedSender::new(tx1, tx2); } Self { + base, strong_consistent, sender, handle1: Arc::new(handle1), @@ -1085,16 +1105,20 @@ impl HedgedHandle { match seq1.cmp(&seq2) { std::cmp::Ordering::Equal => { // TODO: read simultaneously from both disks and return the faster one - if let Some(fd) = self.handle1.try_get() { + if let Some(fd) = self.handle1.try_get(&self.base)? { fd.read(offset, buf) - } else if let Some(fd) = self.handle2.try_get() { + } else if let Some(fd) = self.handle2.try_get(&self.base)? 
{ fd.read(offset, buf) } else { panic!("Both fd1 and fd2 are None"); } } - std::cmp::Ordering::Greater => self.handle1.try_get().unwrap().read(offset, buf), - std::cmp::Ordering::Less => self.handle2.try_get().unwrap().read(offset, buf), + std::cmp::Ordering::Greater => { + self.handle1.try_get(&self.base)?.unwrap().read(offset, buf) + } + std::cmp::Ordering::Less => { + self.handle2.try_get(&self.base)?.unwrap().read(offset, buf) + } } } @@ -1212,7 +1236,7 @@ impl Drop for HedgedHandle { fn drop(&mut self) { if self.strong_consistent { self.sender - .send(Task::Stop, Task::Stop, Box::new(|_| {}), Box::new(|_| {})); + .send(Task::Stop, Task::Stop, empty_callback(), empty_callback()); self.thread1.take().unwrap().join().unwrap(); self.thread2.take().unwrap().join().unwrap(); } @@ -1300,15 +1324,3 @@ impl Read for HedgedReader { Ok(len) } } - -pub fn paired_future_callback() -> (Callback, oneshot::Receiver>) -{ - let (tx, future) = oneshot::channel(); - let callback = Box::new(move |result| { - let r = tx.send(result); - if r.is_err() { - warn!("paired_future_callback: Failed to send result to the future rx, discarded."); - } - }); - (callback, future) -} From ce601a9761695b39bcfa394bd8f576ff035ca2b3 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 14 Sep 2023 18:30:41 +0800 Subject: [PATCH 27/32] remove recover ext Signed-off-by: Connor1996 --- src/engine.rs | 4 +--- src/env/default.rs | 3 --- src/env/hedged.rs | 23 +++++------------------ src/env/mod.rs | 24 +++++------------------- src/env/obfuscated.rs | 3 --- 5 files changed, 11 insertions(+), 46 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index b85c0d7c..cccbf9b4 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -11,7 +11,7 @@ use log::{error, info}; use protobuf::{parse_from_bytes, Message}; use crate::consistency::ConsistencyChecker; -use crate::env::{DefaultFileSystem, FileSystem, RecoverExt}; +use crate::env::{DefaultFileSystem, FileSystem}; use crate::event_listener::EventListener; use crate::file_pipe_log::debug::LogItemReader; use crate::file_pipe_log::{DefaultMachineFactory, FilePipeLog, FilePipeLogBuilder}; @@ -2053,8 +2053,6 @@ pub(crate) mod tests { } } - impl RecoverExt for DeleteMonitoredFileSystem {} - impl FileSystem for DeleteMonitoredFileSystem { type Handle = ::Handle; type Reader = ::Reader; diff --git a/src/env/default.rs b/src/env/default.rs index 9b5084db..7de4c0de 100644 --- a/src/env/default.rs +++ b/src/env/default.rs @@ -5,7 +5,6 @@ use std::os::unix::io::RawFd; use std::path::Path; use std::sync::Arc; -use crate::env::RecoverExt; use fail::fail_point; use log::error; use nix::errno::Errno; @@ -275,8 +274,6 @@ impl WriteExt for LogFile { #[derive(Clone)] pub struct DefaultFileSystem; -impl RecoverExt for DefaultFileSystem {} - impl FileSystem for DefaultFileSystem { type Handle = LogFd; type Reader = LogFile; diff --git a/src/env/hedged.rs b/src/env/hedged.rs index aac23c59..f3644d9b 100644 --- a/src/env/hedged.rs +++ b/src/env/hedged.rs @@ -1,7 +1,6 @@ // Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
use crate::env::default::LogFile; -use crate::env::RecoverExt; use crate::file_pipe_log::log_file::build_file_reader; use crate::file_pipe_log::pipe_builder::FileName; use crate::file_pipe_log::reader::LogItemBatchFileReader; @@ -798,7 +797,11 @@ impl Drop for HedgedFileSystem { } } -impl RecoverExt for HedgedFileSystem { +impl FileSystem for HedgedFileSystem { + type Handle = HedgedHandle; + type Reader = HedgedReader; + type Writer = HedgedWriter; + fn bootstrap(&self) -> IoResult<()> { // catch up diff if !self.path1.exists() { @@ -831,22 +834,6 @@ impl RecoverExt for HedgedFileSystem { Ok(()) } - fn need_recover(&self) -> bool { - false - } - - fn is_in_recover(&self) -> bool { - false - } - - fn trigger_recover(&self) {} -} - -impl FileSystem for HedgedFileSystem { - type Handle = HedgedHandle; - type Reader = HedgedReader; - type Writer = HedgedWriter; - fn create>(&self, path: P) -> IoResult { block_on(self.wait_handle( Task::Create(path.as_ref().to_path_buf()), diff --git a/src/env/mod.rs b/src/env/mod.rs index c7c76ac0..4916c6f2 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -21,11 +21,15 @@ pub enum Permission { } /// FileSystem -pub trait FileSystem: Send + Sync + 'static + RecoverExt { +pub trait FileSystem: Send + Sync + 'static { type Handle: Send + Sync + Handle; type Reader: Seek + Read + Send; type Writer: Seek + Write + Send + WriteExt; + fn bootstrap(&self) -> Result<()> { + Ok(()) + } + fn create>(&self, path: P) -> Result; fn open>(&self, path: P, perm: Permission) -> Result; @@ -67,24 +71,6 @@ pub trait FileSystem: Send + Sync + 'static + RecoverExt { fn new_writer(&self, handle: Arc) -> Result; } -pub trait RecoverExt { - fn bootstrap(&self) -> Result<()> { - Ok(()) - } - - fn need_recover(&self) -> bool { - false - } - - fn is_in_recover(&self) -> bool { - false - } - - fn trigger_recover(&self) { - () - } -} - pub trait Handle { fn truncate(&self, offset: usize) -> Result<()>; diff --git a/src/env/obfuscated.rs b/src/env/obfuscated.rs index d350f4df..6adaf277 100644 --- a/src/env/obfuscated.rs +++ b/src/env/obfuscated.rs @@ -5,7 +5,6 @@ use std::path::Path; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use crate::env::RecoverExt; use crate::env::{DefaultFileSystem, FileSystem, Permission, WriteExt}; pub struct ObfuscatedReader(::Reader); @@ -86,8 +85,6 @@ impl ObfuscatedFileSystem { } } -impl RecoverExt for ObfuscatedFileSystem {} - impl FileSystem for ObfuscatedFileSystem { type Handle = ::Handle; type Reader = ObfuscatedReader; From 76b38bb8b483c6ac9ad26e3d369bb9fd4f76f7c5 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Thu, 14 Sep 2023 20:27:49 +0800 Subject: [PATCH 28/32] separate into multiple files Signed-off-by: Connor1996 --- src/env/hedged.rs | 1313 ------------------------------------- src/env/hedged/mod.rs | 602 +++++++++++++++++ src/env/hedged/recover.rs | 255 +++++++ src/env/hedged/runner.rs | 125 ++++ src/env/hedged/sender.rs | 140 ++++ src/env/hedged/task.rs | 249 +++++++ src/env/hedged/util.rs | 9 + 7 files changed, 1380 insertions(+), 1313 deletions(-) delete mode 100644 src/env/hedged.rs create mode 100644 src/env/hedged/mod.rs create mode 100644 src/env/hedged/recover.rs create mode 100644 src/env/hedged/runner.rs create mode 100644 src/env/hedged/sender.rs create mode 100644 src/env/hedged/task.rs create mode 100644 src/env/hedged/util.rs diff --git a/src/env/hedged.rs b/src/env/hedged.rs deleted file mode 100644 index f3644d9b..00000000 --- a/src/env/hedged.rs +++ /dev/null @@ -1,1313 +0,0 @@ -// Copyright (c) 
2017-present, PingCAP, Inc. Licensed under Apache-2.0. - -use crate::env::default::LogFile; -use crate::file_pipe_log::log_file::build_file_reader; -use crate::file_pipe_log::pipe_builder::FileName; -use crate::file_pipe_log::reader::LogItemBatchFileReader; -use crate::file_pipe_log::FileNameExt; -use crate::internals::parse_reserved_file_name; -use crate::internals::FileId; -use crate::internals::LogQueue; -use crate::Error; -use crossbeam::channel::unbounded; -use crossbeam::channel::{Receiver, Sender}; -use fail::fail_point; -use log::info; -use std::cell::UnsafeCell; -use std::fs; -use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; -use std::path::Path; -use std::path::PathBuf; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; -use std::sync::Arc; -use std::sync::Mutex; -use std::thread; -use std::thread::JoinHandle; - -use crate::env::default::LogFd; -use crate::env::DefaultFileSystem; -use crate::env::{FileSystem, Handle, Permission, WriteExt}; -use futures::channel::oneshot::{self, Canceled}; -use futures::executor::block_on; -use futures::{join, select}; - -use either::Either; - -type Callback = Box) -> IoResult + Send>; - -fn empty_callback() -> Callback { - Box::new(|_| Ok(TaskRes::Noop)) -} - -fn paired_future_callback() -> (Callback, oneshot::Receiver>) { - let (tx, future) = oneshot::channel(); - let callback = Box::new(move |result| -> IoResult { - if let Err(result) = tx.send(result) { - return result; - } - Ok(TaskRes::Noop) - }); - (callback, future) -} - -// TODO: add metrics -// TODO: handle specially on config change(upgrade and downgrade) -// TODO: remove recover ext -// TODO: add comment and rename - -struct SeqTask { - inner: Task, - seq: u64, -} - -#[derive(Clone)] -enum Task { - Create(PathBuf), - Open { - path: PathBuf, - perm: Permission, - }, - Delete(PathBuf), - Rename { - src_path: PathBuf, - dst_path: PathBuf, - }, - Truncate { - handle: Arc, - offset: usize, - }, - FileSize(Arc), - Sync(Arc), - Write { - handle: Arc, - offset: usize, - bytes: Vec, - }, - Allocate { - handle: Arc, - offset: usize, - size: usize, - }, - Pause, - Snapshot, - Stop, -} - -impl SeqTask { - fn process(self, file_system: &DefaultFileSystem, path: &PathBuf) -> IoResult { - match self.inner { - Task::Create(path) => file_system.create(&path).map(|h| TaskRes::Create { - fd: h, - is_for_rewrite: path.extension().map_or(false, |ext| ext == "rewrite"), - }), - Task::Open { path, perm } => file_system.open(&path, perm).map(|h| TaskRes::Open { - fd: h, - is_for_rewrite: path.extension().map_or(false, |ext| ext == "rewrite"), - }), - Task::Delete(path) => file_system.delete(path).map(|_| TaskRes::Delete), - Task::Rename { src_path, dst_path } => file_system - .rename(src_path, dst_path) - .map(|_| TaskRes::Rename), - Task::Snapshot => { - let mut files = HedgedFileSystem::get_files(&path)?; - files.append_files = files - .append_files - .into_iter() - .map(|f| f.into_handle(file_system)) - .collect(); - // exclude rewrite files, as they are always synced - files.reserved_files = files - .reserved_files - .into_iter() - .map(|f| f.into_handle(file_system)) - .collect(); - Ok(TaskRes::Snapshot((self.seq, files))) - } - Task::Stop | Task::Pause => unreachable!(), - _ => self.handle_process(file_system), - } - } - - fn handle_process(self, file_system: &DefaultFileSystem) -> IoResult { - match self.inner { - Task::Truncate { handle, offset } => handle - .get(file_system)? 
- .truncate(offset) - .map(|_| TaskRes::Truncate), - Task::FileSize(handle) => handle - .get(file_system)? - .file_size() - .map(|s| TaskRes::FileSize(s)), - Task::Sync(handle) => handle.get(file_system)?.sync().map(|_| TaskRes::Sync), - Task::Write { - handle, - offset, - bytes, - } => handle - .get(file_system)? - .write(offset, &bytes) - .map(|s| TaskRes::Write(s)), - Task::Allocate { - handle, - offset, - size, - } => handle - .get(file_system)? - .allocate(offset, size) - .map(|_| TaskRes::Allocate), - _ => unreachable!(), - } - } -} - -enum TaskRes { - Noop, - Create { fd: LogFd, is_for_rewrite: bool }, - Open { fd: LogFd, is_for_rewrite: bool }, - Delete, - Rename, - Truncate, - FileSize(usize), - Sync, - Write(usize), - Allocate, - Snapshot((u64, Files)), - Stop, -} - -#[derive(Default)] -struct Files { - prefix: PathBuf, - append_files: Vec, - rewrite_files: Vec, - reserved_files: Vec, -} - -enum SeqFile { - Path(FileName), - Handle((FileName, Arc)), -} - -impl SeqFile { - fn seq(&self) -> u64 { - match self { - SeqFile::Path(f) => f.seq, - SeqFile::Handle((f, _)) => f.seq, - } - } - - fn path(&self) -> &PathBuf { - match self { - SeqFile::Path(f) => &f.path, - SeqFile::Handle((f, _)) => &f.path, - } - } - - fn remove(&self) -> IoResult<()> { - match self { - SeqFile::Path(f) => fs::remove_file(&f.path), - SeqFile::Handle((f, _)) => fs::remove_file(&f.path), - } - } - - fn copy(&self, file_system: &DefaultFileSystem, to: &PathBuf) -> IoResult { - match self { - SeqFile::Path(f) => fs::copy(&f.path, to.as_path()), - SeqFile::Handle((_, fd)) => { - let mut reader = LogFile::new(fd.clone()); - let mut writer = LogFile::new(Arc::new(file_system.create(to)?)); - std::io::copy(&mut reader, &mut writer) - } - } - } - - fn into_handle(mut self, file_system: &DefaultFileSystem) -> Self { - if let SeqFile::Path(f) = self { - let fd = Arc::new(file_system.open(&f.path, Permission::ReadOnly).unwrap()); - self = SeqFile::Handle((f, fd)); - } - self - } -} - -fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { - if let Ok(file) = path.strip_prefix(from) { - to.to_path_buf().join(file) - } else { - panic!("Invalid path: {:?}", path); - } -} - -// let say the average entry size is 100B, then the total size of the log in the -// channel is 1GB, -const PAUSE_THRESHOLD: usize = 10000; - -fn get_pause_threshold() -> usize { - fail_point!("hedged::pause_threshold", |s| s - .unwrap() - .parse::() - .unwrap()); - PAUSE_THRESHOLD -} - -// Make sure the task is sent to two disks' channel atomically, otherwise the -// ordering of the tasks in two disks' channels are not same. 
-#[derive(Clone)] -struct HedgedSender(Arc>); - -struct HedgedSenderInner { - disk1: Sender<(SeqTask, Callback)>, - disk2: Sender<(SeqTask, Callback)>, - seq: u64, - state: State, -} - -impl HedgedSender { - fn new(disk1: Sender<(SeqTask, Callback)>, disk2: Sender<(SeqTask, Callback)>) -> Self { - Self(Arc::new(Mutex::new(HedgedSenderInner { - disk1, - disk2, - seq: 0, - state: State::Normal, - }))) - } - - fn state(&self) -> State { - self.0.lock().unwrap().state.clone() - } - - fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { - if matches!(task1, Task::Pause | Task::Snapshot) { - unreachable!(); - } - - let mut inner = self.0.lock().unwrap(); - inner.seq += 1; - let task1 = SeqTask { - inner: task1, - seq: inner.seq, - }; - let task2 = SeqTask { - inner: task2, - seq: inner.seq, - }; - if matches!(inner.state, State::Normal) { - let check1 = inner.disk1.len() > get_pause_threshold(); - let check2 = inner.disk2.len() > get_pause_threshold(); - match (check1, check2) { - (true, true) => { - panic!("Both channels of disk1 and disk2 are full") - } - (true, false) => { - inner.state = State::Paused1; - inner - .disk1 - .send(( - SeqTask { - inner: Task::Pause, - seq: 0, - }, - empty_callback(), - )) - .unwrap(); - } - (false, true) => { - inner.state = State::Paused2; - inner - .disk2 - .send(( - SeqTask { - inner: Task::Pause, - seq: 0, - }, - empty_callback(), - )) - .unwrap(); - } - _ => {} - } - } - if !matches!(inner.state, State::Paused1) { - inner.disk1.send((task1, cb1)).unwrap(); - } - if !matches!(inner.state, State::Paused2) { - inner.disk2.send((task2, cb2)).unwrap(); - } - } - - fn send_snapshot(&self, cb: Callback) { - let mut inner = self.0.lock().unwrap(); - inner.seq += 1; - let task = SeqTask { - inner: Task::Snapshot, - seq: inner.seq, - }; - match inner.state { - State::Paused1 => { - inner.disk2.send((task, cb)).unwrap(); - } - State::Paused2 => { - inner.disk1.send((task, cb)).unwrap(); - } - _ => unreachable!(), - } - inner.state = State::Recovering; - } - - fn finish_snapshot(&self) { - let mut inner = self.0.lock().unwrap(); - inner.state = State::Normal; - } -} - -#[derive(Debug, PartialEq, Clone)] -pub enum State { - Normal, - Paused1, /* When the length of channel of disk1 reaches threshold, a - * `Pause` task is sent and no more later task will be sent - * to disk1 */ - Paused2, // no more task will be sent to disk2 - Recovering, -} - -pub struct HedgedFileSystem { - base: Arc, - - path1: PathBuf, - path2: PathBuf, - - sender: HedgedSender, - - seqno1: Arc, - seqno2: Arc, - - thread1: Option>, - thread2: Option>, -} - -struct TaskRunner { - id: u8, - path: PathBuf, - fs: Arc, - rx: Receiver<(SeqTask, Callback)>, - sender: HedgedSender, - seqno: Arc, -} - -impl TaskRunner { - fn new( - id: u8, - path: PathBuf, - fs: Arc, - rx: Receiver<(SeqTask, Callback)>, - sender: HedgedSender, - seqno: Arc, - ) -> Self { - Self { - id, - path, - fs, - rx, - sender, - seqno, - } - } - - fn spawn(self) -> JoinHandle<()> { - let id = self.id; - thread::Builder::new() - .name(format!("raft-engine-disk{}", id)) - .spawn(move || { - if let Err(e) = self.poll() { - panic!("disk {} failed: {:?}", id, e); - } - }) - .unwrap() - } - - fn poll(self) -> IoResult<()> { - let mut last_seq = 0; - let mut snap_seq = None; - for (task, cb) in self.rx { - if let Task::Stop = task.inner { - cb(Ok(TaskRes::Stop))?; - break; - } - if let Task::Pause = task.inner { - // Encountering `Pause`, indicate the disk may not slow anymore - let (cb, f) = paired_future_callback(); - 
self.sender.send_snapshot(cb); - let to_files = HedgedFileSystem::get_files(&self.path)?; - let from_files = block_on(f).unwrap().map(|res| { - if let TaskRes::Snapshot((seq, files)) = res { - snap_seq = Some(seq); - files - } else { - unreachable!() - } - })?; - - // Snapshot doesn't include the file size, so it would copy more data than - // the data seen at the time of snapshot. But it's okay, as the data is - // written with specific offset, so the data written - // of no necessity will be overwritten by the latter writes. - // Exclude rewrite files because rewrite files are always synced. - HedgedFileSystem::catch_up_diff(&self.fs, from_files, to_files, true)?; - - self.sender.finish_snapshot(); - self.seqno.store(snap_seq.unwrap(), Ordering::Relaxed); - last_seq = snap_seq.unwrap(); - continue; - } - if self.id == 1 { - fail_point!("hedged::task_runner::thread1"); - } - let seq = task.seq; - assert_ne!(seq, 0); - if let Some(snap) = snap_seq.as_ref() { - // the change already included in the snapshot - if seq < *snap { - continue; - } else if seq == *snap { - unreachable!(); - } else if seq == *snap + 1 { - snap_seq = None; - } else { - panic!("seqno {} is larger than snapshot seqno {}", seq, *snap); - } - } - - assert_eq!(last_seq + 1, seq); - last_seq = seq; - let res = task.process(&self.fs, &self.path); - // seqno should be updated before the write callback is called, otherwise one - // read may be performed right after the write is finished. Then the read may be - // performed on the other disk not having the data because the seqno for this - // disk is not updated yet. - self.seqno.store(seq, Ordering::Relaxed); - cb(res)?; - } - Ok(()) - } -} - -// TODO: read both dir at recovery, maybe no need? cause operations are to both -// disks TODO: consider encryption -impl HedgedFileSystem { - pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { - let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); - let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); - let sender = HedgedSender::new(tx1, tx2); - - let seqno1 = Arc::new(AtomicU64::new(0)); - let seqno2 = Arc::new(AtomicU64::new(0)); - let runner1 = TaskRunner::new( - 1, - path1.clone(), - base.clone(), - rx1, - sender.clone(), - seqno1.clone(), - ); - let runner2 = TaskRunner::new( - 2, - path2.clone(), - base.clone(), - rx2, - sender.clone(), - seqno2.clone(), - ); - let thread1 = runner1.spawn(); - let thread2 = runner2.spawn(); - Self { - base, - path1, - path2, - sender, - seqno1, - seqno2, - thread1: Some(thread1), - thread2: Some(thread2), - } - } - - pub fn state(&self) -> State { - self.sender.state() - } - - fn catch_up_diff( - fs: &Arc, - mut from_files: Files, - mut to_files: Files, - skip_rewrite: bool, - ) -> IoResult<()> { - from_files - .append_files - .sort_by(|a, b| a.seq().cmp(&b.seq())); - to_files.append_files.sort_by(|a, b| a.seq().cmp(&b.seq())); - from_files - .rewrite_files - .sort_by(|a, b| a.seq().cmp(&b.seq())); - to_files.rewrite_files.sort_by(|a, b| a.seq().cmp(&b.seq())); - from_files - .reserved_files - .sort_by(|a, b| a.seq().cmp(&b.seq())); - to_files - .reserved_files - .sort_by(|a, b| a.seq().cmp(&b.seq())); - - let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { - let last_from_seq = from.last().map(|f| f.seq()).unwrap_or(0); - - let mut iter1 = from.iter().peekable(); - let mut iter2 = to.iter().peekable(); - // compare files of from and to, if the file in from is not in to, copy it to - // to, and if the file in to is not in from, delete it - loop { - match (iter1.peek(), 
iter2.peek()) { - (None, None) => break, - (Some(f1), None) => { - let to = replace_path( - f1.path().as_ref(), - from_files.prefix.as_ref(), - to_files.prefix.as_ref(), - ); - f1.copy(fs, &to)?; - iter1.next(); - } - (None, Some(f2)) => { - f2.remove()?; - iter2.next(); - } - (Some(f1), Some(f2)) => { - match f1.seq().cmp(&f2.seq()) { - std::cmp::Ordering::Equal => { - // check file size is not enough, treat the last files differently - // considering the recycle, always copy the last file - // TODO: only copy diff part - if f1.seq() == last_from_seq { - let to = replace_path( - f1.path().as_ref(), - from_files.prefix.as_ref(), - to_files.prefix.as_ref(), - ); - f1.copy(fs, &to)?; - } - iter1.next(); - iter2.next(); - } - std::cmp::Ordering::Less => { - let to = replace_path( - f1.path().as_ref(), - from_files.prefix.as_ref(), - to_files.prefix.as_ref(), - ); - f1.copy(fs, &to)?; - iter1.next(); - } - std::cmp::Ordering::Greater => { - f2.remove()?; - iter2.next(); - } - } - } - } - } - Ok(()) - }; - - check_files(&from_files.append_files, &to_files.append_files)?; - if !skip_rewrite { - check_files(&from_files.rewrite_files, &to_files.rewrite_files)?; - } - check_files(&from_files.reserved_files, &to_files.reserved_files)?; - Ok(()) - } - - fn get_files(path: &PathBuf) -> IoResult { - assert!(path.exists()); - - let mut files = Files { - prefix: path.clone(), - ..Default::default() - }; - - fs::read_dir(path) - .unwrap() - .try_for_each(|e| -> IoResult<()> { - let dir_entry = e?; - let p = dir_entry.path(); - if !p.is_file() { - return Ok(()); - } - let file_name = p.file_name().unwrap().to_str().unwrap(); - match FileId::parse_file_name(file_name) { - Some(FileId { - queue: LogQueue::Append, - seq, - }) => files.append_files.push(SeqFile::Path(FileName { - seq, - path: p, - path_id: 0, - })), - Some(FileId { - queue: LogQueue::Rewrite, - seq, - }) => files.rewrite_files.push(SeqFile::Path(FileName { - seq, - path: p, - path_id: 0, - })), - _ => { - if let Some(seq) = parse_reserved_file_name(file_name) { - files.reserved_files.push(SeqFile::Path(FileName { - seq, - path: p, - path_id: 0, - })) - } - } - } - Ok(()) - }) - .unwrap(); - - Ok(files) - } - - fn get_latest_valid_seq(&self, files: &Files) -> IoResult { - let mut count = 0; - if let Some(f) = files.append_files.last() { - let recovery_read_block_size = 1024; - let mut reader = LogItemBatchFileReader::new(recovery_read_block_size); - let handle = Arc::new(self.base.open(&f.path(), Permission::ReadOnly)?); - let file_reader = build_file_reader(self.base.as_ref(), handle)?; - match reader.open( - FileId { - queue: LogQueue::Append, - seq: f.seq(), - }, - file_reader, - ) { - Err(e) => match e { - Error::Io(err) => return Err(err), - _ => return Ok(0), - }, - Ok(_) => { - // Do nothing - } - } - loop { - match reader.next() { - Ok(Some(_)) => { - count += 1; - } - Ok(None) => break, - Err(_) => break, - } - } - } - - Ok(count) - } - - async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { - let (cb1, mut f1) = paired_future_callback(); - let (cb2, mut f2) = paired_future_callback(); - self.sender.send(task1.clone(), task2.clone(), cb1, cb2); - - let resolve = |res: TaskRes| -> (LogFd, bool) { - match res { - TaskRes::Create { fd, is_for_rewrite } => (fd, is_for_rewrite), - TaskRes::Open { fd, is_for_rewrite } => (fd, is_for_rewrite), - _ => unreachable!(), - } - }; - select! 
{ - res1 = f1 => res1.unwrap().map(|res| { - let (fd, is_for_rewrite) = resolve(res); - HedgedHandle::new( - self.base.clone(), - is_for_rewrite, - self.sender.clone(), - FutureHandle::new_owned(fd), - FutureHandle::new(f2, task2), - self.seqno1.clone(), - self.seqno2.clone(), - ) - }), - res2 = f2 => res2.unwrap().map(|res| { - let (fd, is_for_rewrite) = resolve(res); - HedgedHandle::new( - self.base.clone(), - is_for_rewrite, - self.sender.clone(), - FutureHandle::new(f1, task1), - FutureHandle::new_owned(fd) , - self.seqno1.clone(), - self.seqno2.clone(), - ) - }), - } - } - - async fn wait(&self, task1: Task, task2: Task) -> IoResult<()> { - let (cb1, mut f1) = paired_future_callback(); - let (cb2, mut f2) = paired_future_callback(); - self.sender.send(task1, task2, cb1, cb2); - - select! { - res1 = f1 => res1.unwrap().map(|_| ()), - res2 = f2 => res2.unwrap().map(|_| ()), - } - } -} - -impl Drop for HedgedFileSystem { - fn drop(&mut self) { - block_on(self.wait(Task::Stop, Task::Stop)).unwrap(); - - let t1 = self.thread1.take().unwrap(); - let t2 = self.thread2.take().unwrap(); - let mut times = 0; - loop { - // wait 1s - if t1.is_finished() && t2.is_finished() { - t1.join().unwrap(); - t2.join().unwrap(); - break; - } - times += 1; - if times > 100 { - // one disk may be blocked for a long time, - // to avoid block shutdown process for a long time, do not join the threads - // here, only need at least to ensure one thread is exited - if t1.is_finished() || t2.is_finished() { - if t1.is_finished() { - t1.join().unwrap(); - } else { - t2.join().unwrap(); - } - break; - } - } - std::thread::sleep(std::time::Duration::from_millis(10)); - } - } -} - -impl FileSystem for HedgedFileSystem { - type Handle = HedgedHandle; - type Reader = HedgedReader; - type Writer = HedgedWriter; - - fn bootstrap(&self) -> IoResult<()> { - // catch up diff - if !self.path1.exists() { - info!("Create raft log directory: {}", self.path1.display()); - fs::create_dir(&self.path1).unwrap(); - } - if !self.path2.exists() { - info!("Create raft log directory: {}", self.path2.display()); - fs::create_dir(&self.path2).unwrap(); - } - let files1 = HedgedFileSystem::get_files(&self.path1)?; - let files2 = HedgedFileSystem::get_files(&self.path2)?; - - let count1 = self.get_latest_valid_seq(&files1)?; - let count2 = self.get_latest_valid_seq(&files2)?; - - match count1.cmp(&count2) { - std::cmp::Ordering::Equal => { - // still need to catch up, but only diff - HedgedFileSystem::catch_up_diff(&self.base, files1, files2, false)?; - return Ok(()); - } - std::cmp::Ordering::Less => { - HedgedFileSystem::catch_up_diff(&self.base, files2, files1, false)?; - } - std::cmp::Ordering::Greater => { - HedgedFileSystem::catch_up_diff(&self.base, files1, files2, false)?; - } - } - Ok(()) - } - - fn create>(&self, path: P) -> IoResult { - block_on(self.wait_handle( - Task::Create(path.as_ref().to_path_buf()), - Task::Create(replace_path( - path.as_ref(), - self.path1.as_ref(), - self.path2.as_ref(), - )), - )) - } - - fn open>(&self, path: P, perm: Permission) -> IoResult { - block_on(self.wait_handle( - Task::Open { - path: path.as_ref().to_path_buf(), - perm, - }, - Task::Open { - path: replace_path(path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), - perm, - }, - )) - } - - fn delete>(&self, path: P) -> IoResult<()> { - block_on(self.wait( - Task::Delete(path.as_ref().to_path_buf()), - Task::Delete(replace_path( - path.as_ref(), - self.path1.as_ref(), - self.path2.as_ref(), - )), - )) - } - - fn rename>(&self, src_path: P, 
dst_path: P) -> IoResult<()> { - block_on(self.wait( - Task::Rename { - src_path: src_path.as_ref().to_path_buf(), - dst_path: dst_path.as_ref().to_path_buf(), - }, - Task::Rename { - src_path: replace_path(src_path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), - dst_path: replace_path(dst_path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), - }, - )) - } - - fn new_reader(&self, handle: Arc) -> IoResult { - Ok(HedgedReader::new(handle)) - } - - fn new_writer(&self, handle: Arc) -> IoResult { - Ok(HedgedWriter::new(handle)) - } -} - -pub struct FutureHandle { - inner: UnsafeCell>, Arc>>, - task: Option, -} - -unsafe impl Send for FutureHandle {} - -// To avoid using `Mutex` -// Safety: -// For write, all writes are serialized to one channel, so only one thread will -// update the inner. For read, multiple readers and one writer and may visit -// try_get() concurrently to get the fd from receiver. The receiver is `Sync`, -// so only one of them will get the fd, and update the inner to Arc. -unsafe impl Sync for FutureHandle {} - -impl FutureHandle { - fn new(rx: oneshot::Receiver>, task: Task) -> Self { - Self { - inner: UnsafeCell::new(Either::Left(rx)), - task: Some(task), - } - } - fn new_owned(h: LogFd) -> Self { - Self { - inner: UnsafeCell::new(Either::Right(Arc::new(h))), - task: None, - } - } - - fn get(&self, file_system: &DefaultFileSystem) -> IoResult> { - let mut set = false; - let fd = match unsafe { &mut *self.inner.get() } { - Either::Left(rx) => { - set = true; - match block_on(rx) { - Err(Canceled) => self.retry_canceled(file_system)?, - Ok(res) => match res? { - TaskRes::Open { fd, .. } => Arc::new(fd), - TaskRes::Create { fd, .. } => Arc::new(fd), - _ => unreachable!(), - }, - } - } - Either::Right(w) => w.clone(), - }; - if set { - unsafe { - *self.inner.get() = Either::Right(fd.clone()); - } - } - Ok(fd) - } - - fn try_get(&self, file_system: &DefaultFileSystem) -> IoResult>> { - let mut set = false; - let fd = match unsafe { &mut *self.inner.get() } { - Either::Left(rx) => { - set = true; - match rx.try_recv() { - Err(Canceled) => self.retry_canceled(file_system)?, - Ok(None) => return Ok(None), - Ok(Some(res)) => match res? { - TaskRes::Open { fd, .. } => Arc::new(fd), - TaskRes::Create { fd, .. 
} => Arc::new(fd), - _ => unreachable!(), - }, - } - } - Either::Right(w) => w.clone(), - }; - if set { - unsafe { - *self.inner.get() = Either::Right(fd.clone()); - } - } - Ok(Some(fd)) - } - - fn retry_canceled(&self, file_system: &DefaultFileSystem) -> IoResult> { - // Canceled is caused by the task is dropped when in paused state, - // so we should retry the task now - Ok(match self.task.as_ref().unwrap() { - Task::Create(path) => { - // has been already created, so just open - let fd = file_system.open(path, Permission::ReadWrite)?; - Arc::new(fd) - } - Task::Open { path, perm } => { - let fd = file_system.open(path, *perm)?; - Arc::new(fd) - } - _ => unreachable!(), - }) - } -} - -pub struct HedgedHandle { - base: Arc, - - // for rewrite file, all the operations should wait both disks finished - strong_consistent: bool, - - sender: HedgedSender, - - handle1: Arc, - handle2: Arc, - - seqno1: Arc, - seqno2: Arc, - - thread1: Option>, - thread2: Option>, -} - -impl HedgedHandle { - fn new( - base: Arc, - strong_consistent: bool, - mut sender: HedgedSender, - handle1: FutureHandle, - handle2: FutureHandle, - mut seqno1: Arc, - mut seqno2: Arc, - ) -> Self { - let mut thread1 = None; - let mut thread2 = None; - if strong_consistent { - // use two separated threads for both wait - let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); - let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); - // replace the seqno with self owned, then in `read` the seqno from two disks - // should be always the same. It's just to reuse the logic without - // adding special check in `read` - seqno1 = Arc::new(AtomicU64::new(0)); - seqno2 = Arc::new(AtomicU64::new(0)); - let poll = |(rx, seqno, fs): ( - Receiver<(SeqTask, Callback)>, - Arc, - Arc, - )| { - for (task, cb) in rx { - if let Task::Stop = task.inner { - break; - } - assert!(!matches!(task.inner, Task::Pause | Task::Snapshot)); - let res = task.handle_process(&fs); - seqno.fetch_add(1, Ordering::Relaxed); - cb(res).unwrap(); - } - }; - let args1 = (rx1, seqno1.clone(), base.clone()); - thread1 = Some(thread::spawn(move || { - poll(args1); - })); - let args2 = (rx2, seqno2.clone(), base.clone()); - thread2 = Some(thread::spawn(move || { - poll(args2); - })); - sender = HedgedSender::new(tx1, tx2); - } - - Self { - base, - strong_consistent, - sender, - handle1: Arc::new(handle1), - handle2: Arc::new(handle2), - seqno1, - seqno2, - thread1, - thread2, - } - } - - fn read(&self, offset: usize, buf: &mut [u8]) -> IoResult { - // Raft engine promises that the offset would be read only after the write is - // finished and memtable is updated. And the hedged file system promises that - // the write is done when either one of the disk finishes the write. Here the - // read data must be present in at least one of the disks. So choose the disk of - // largest seqno to read. - // - // Safety: the get for these two seqno is not necessary to be atomic. - // What if the seqno2 is updated after getting seqno1? It's fine, let's say - // - T1 denotes the time of getting seqno1, the actual seqno for disk1 and disk2 - // is S1, S2 - // - T2 denotes the time of getting seqno2, the actual seqno for disk1 and disk2 - // is S1', S2' - // Assume disk2 is just slightly slower than disk1, here is a possible case: - // - T1: S1 = 10, S2 = 9 - // - T2: S1'= 12, S2'= 11 - // Then, what we get would be seq1=10, seq2=11, and the read would be performed - // on disk2. But disk2 is slower than disk1. The data may not be written yet. - // Would the read on a slower disk is safe? 
- // Yes, it's safe because at T1 we know the data can be read at least with a - // seqno of S1, then at T2, S2' > S1, so the data must be already written in the - // disk2, even if it's the slow disk. - let seq1 = self.seqno1.load(Ordering::Relaxed); - let seq2 = self.seqno2.load(Ordering::Relaxed); - match seq1.cmp(&seq2) { - std::cmp::Ordering::Equal => { - // TODO: read simultaneously from both disks and return the faster one - if let Some(fd) = self.handle1.try_get(&self.base)? { - fd.read(offset, buf) - } else if let Some(fd) = self.handle2.try_get(&self.base)? { - fd.read(offset, buf) - } else { - panic!("Both fd1 and fd2 are None"); - } - } - std::cmp::Ordering::Greater => { - self.handle1.try_get(&self.base)?.unwrap().read(offset, buf) - } - std::cmp::Ordering::Less => { - self.handle2.try_get(&self.base)?.unwrap().read(offset, buf) - } - } - } - - fn write(&self, offset: usize, content: &[u8]) -> IoResult { - block_on(self.wait( - Task::Write { - handle: self.handle1.clone(), - offset, - bytes: content.to_vec(), - }, - Task::Write { - handle: self.handle2.clone(), - offset, - bytes: content.to_vec(), - }, - )) - .map(|res| { - if let TaskRes::Write(size) = res { - size - } else { - unreachable!() - } - }) - } - - fn allocate(&self, offset: usize, size: usize) -> IoResult<()> { - block_on(self.wait( - Task::Allocate { - handle: self.handle1.clone(), - offset, - size, - }, - Task::Allocate { - handle: self.handle2.clone(), - offset, - size, - }, - )) - .map(|_| ()) - } - - async fn wait_one(&self, task1: Task, task2: Task) -> IoResult { - let (cb1, mut f1) = paired_future_callback(); - let (cb2, mut f2) = paired_future_callback(); - self.sender.send(task1, task2, cb1, cb2); - - select! { - res1 = f1 => res1.unwrap(), - res2 = f2 => res2.unwrap(), - } - } - - async fn wait_both(&self, task1: Task, task2: Task) -> IoResult { - let (cb1, f1) = paired_future_callback(); - let (cb2, f2) = paired_future_callback(); - self.sender.send(task1, task2, cb1, cb2); - - let (res1, res2) = join!(f1, f2); - match (res1.unwrap(), res2.unwrap()) { - (res @ Ok(_), Ok(_)) => res, - (Err(e), Err(_)) => Err(e), - (Err(e), _) => Err(e), - (_, Err(e)) => Err(e), - } - } - - async fn wait(&self, task1: Task, task2: Task) -> IoResult { - if self.strong_consistent { - self.wait_both(task1, task2).await - } else { - self.wait_one(task1, task2).await - } - } -} - -impl Handle for HedgedHandle { - fn truncate(&self, offset: usize) -> IoResult<()> { - block_on(self.wait( - Task::Truncate { - handle: self.handle1.clone(), - offset, - }, - Task::Truncate { - handle: self.handle2.clone(), - offset, - }, - )) - .map(|_| ()) - } - - fn file_size(&self) -> IoResult { - block_on(self.wait( - Task::FileSize(self.handle1.clone()), - Task::FileSize(self.handle2.clone()), - )) - .map(|res| { - if let TaskRes::FileSize(size) = res { - size - } else { - unreachable!() - } - }) - } - - fn sync(&self) -> IoResult<()> { - block_on(self.wait( - Task::Sync(self.handle1.clone()), - Task::Sync(self.handle2.clone()), - )) - .map(|_| ()) - } -} - -impl Drop for HedgedHandle { - fn drop(&mut self) { - if self.strong_consistent { - self.sender - .send(Task::Stop, Task::Stop, empty_callback(), empty_callback()); - self.thread1.take().unwrap().join().unwrap(); - self.thread2.take().unwrap().join().unwrap(); - } - } -} - -pub struct HedgedWriter { - inner: Arc, - offset: usize, -} - -impl HedgedWriter { - pub fn new(handle: Arc) -> Self { - Self { - inner: handle, - offset: 0, - } - } -} - -impl Write for HedgedWriter { - fn write(&mut 
self, buf: &[u8]) -> IoResult { - let len = self.inner.write(self.offset, buf)?; - self.offset += len; - Ok(len) - } - - fn flush(&mut self) -> IoResult<()> { - Ok(()) - } -} - -impl WriteExt for HedgedWriter { - fn truncate(&mut self, offset: usize) -> IoResult<()> { - self.inner.truncate(offset)?; - self.offset = offset; - Ok(()) - } - - fn allocate(&mut self, offset: usize, size: usize) -> IoResult<()> { - self.inner.allocate(offset, size) - } -} - -impl Seek for HedgedWriter { - fn seek(&mut self, pos: SeekFrom) -> IoResult { - match pos { - SeekFrom::Start(offset) => self.offset = offset as usize, - SeekFrom::Current(i) => self.offset = (self.offset as i64 + i) as usize, - SeekFrom::End(i) => self.offset = (self.inner.file_size()? as i64 + i) as usize, - } - Ok(self.offset as u64) - } -} - -pub struct HedgedReader { - inner: Arc, - offset: usize, -} - -impl HedgedReader { - pub fn new(handle: Arc) -> Self { - Self { - inner: handle, - offset: 0, - } - } -} - -impl Seek for HedgedReader { - fn seek(&mut self, pos: SeekFrom) -> IoResult { - match pos { - SeekFrom::Start(offset) => self.offset = offset as usize, - SeekFrom::Current(i) => self.offset = (self.offset as i64 + i) as usize, - SeekFrom::End(i) => self.offset = (self.inner.file_size()? as i64 + i) as usize, - } - Ok(self.offset as u64) - } -} - -impl Read for HedgedReader { - fn read(&mut self, buf: &mut [u8]) -> IoResult { - let len = self.inner.read(self.offset, buf)?; - self.offset += len; - Ok(len) - } -} diff --git a/src/env/hedged/mod.rs b/src/env/hedged/mod.rs new file mode 100644 index 00000000..1f5bc343 --- /dev/null +++ b/src/env/hedged/mod.rs @@ -0,0 +1,602 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use crossbeam::channel::unbounded; +use crossbeam::channel::Receiver; +use log::info; +use std::fs; +use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; +use std::path::Path; +use std::path::PathBuf; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::thread; +use std::thread::JoinHandle; + +use crate::env::default::LogFd; +use crate::env::DefaultFileSystem; +use crate::env::{FileSystem, Handle, Permission, WriteExt}; +use futures::executor::block_on; +use futures::{join, select}; + +mod recover; +mod runner; +mod sender; +mod task; +mod util; + +use runner::TaskRunner; +use sender::HedgedSender; +use task::{ + empty_callback, paired_future_callback, Callback, FutureHandle, SeqTask, Task, TaskRes, +}; +use util::replace_path; + +pub use sender::State; + +// TODO: add metrics +// TODO: handle specially on config change(upgrade and downgrade) +// TODO: add comment and rename + +pub struct HedgedFileSystem { + base: Arc, + + path1: PathBuf, + path2: PathBuf, + + sender: HedgedSender, + + seqno1: Arc, + seqno2: Arc, + + thread1: Option>, + thread2: Option>, +} + +// TODO: read both dir at recovery, maybe no need? 
cause operations are to both +// disks TODO: consider encryption +impl HedgedFileSystem { + pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { + let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); + let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); + let sender = HedgedSender::new(tx1, tx2); + + let seqno1 = Arc::new(AtomicU64::new(0)); + let seqno2 = Arc::new(AtomicU64::new(0)); + let runner1 = TaskRunner::new( + 1, + path1.clone(), + base.clone(), + rx1, + sender.clone(), + seqno1.clone(), + ); + let runner2 = TaskRunner::new( + 2, + path2.clone(), + base.clone(), + rx2, + sender.clone(), + seqno2.clone(), + ); + let thread1 = runner1.spawn(); + let thread2 = runner2.spawn(); + Self { + base, + path1, + path2, + sender, + seqno1, + seqno2, + thread1: Some(thread1), + thread2: Some(thread2), + } + } + + pub fn state(&self) -> State { + self.sender.state() + } + + async fn wait_handle(&self, task1: Task, task2: Task) -> IoResult { + let (cb1, mut f1) = paired_future_callback(); + let (cb2, mut f2) = paired_future_callback(); + self.sender.send(task1.clone(), task2.clone(), cb1, cb2); + + let resolve = |res: TaskRes| -> (LogFd, bool) { + match res { + TaskRes::Create { fd, is_for_rewrite } => (fd, is_for_rewrite), + TaskRes::Open { fd, is_for_rewrite } => (fd, is_for_rewrite), + _ => unreachable!(), + } + }; + select! { + res1 = f1 => res1.unwrap().map(|res| { + let (fd, is_for_rewrite) = resolve(res); + HedgedHandle::new( + self.base.clone(), + is_for_rewrite, + self.sender.clone(), + FutureHandle::new_owned(fd), + FutureHandle::new(f2, task2), + self.seqno1.clone(), + self.seqno2.clone(), + ) + }), + res2 = f2 => res2.unwrap().map(|res| { + let (fd, is_for_rewrite) = resolve(res); + HedgedHandle::new( + self.base.clone(), + is_for_rewrite, + self.sender.clone(), + FutureHandle::new(f1, task1), + FutureHandle::new_owned(fd) , + self.seqno1.clone(), + self.seqno2.clone(), + ) + }), + } + } + + async fn wait(&self, task1: Task, task2: Task) -> IoResult<()> { + let (cb1, mut f1) = paired_future_callback(); + let (cb2, mut f2) = paired_future_callback(); + self.sender.send(task1, task2, cb1, cb2); + + select! 
{ + res1 = f1 => res1.unwrap().map(|_| ()), + res2 = f2 => res2.unwrap().map(|_| ()), + } + } +} + +impl Drop for HedgedFileSystem { + fn drop(&mut self) { + block_on(self.wait(Task::Stop, Task::Stop)).unwrap(); + + let t1 = self.thread1.take().unwrap(); + let t2 = self.thread2.take().unwrap(); + let mut times = 0; + loop { + if t1.is_finished() && t2.is_finished() { + t1.join().unwrap(); + t2.join().unwrap(); + break; + } + times += 1; + if times > 100 { + // wait 1s + // one disk may be blocked for a long time, + // to avoid block shutdown process for a long time, do not join the threads + // here, only need at least to ensure one thread is exited + if t1.is_finished() || t2.is_finished() { + if t1.is_finished() { + t1.join().unwrap(); + } else { + t2.join().unwrap(); + } + break; + } + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + } +} + +impl FileSystem for HedgedFileSystem { + type Handle = HedgedHandle; + type Reader = HedgedReader; + type Writer = HedgedWriter; + + fn bootstrap(&self) -> IoResult<()> { + // catch up diff + if !self.path1.exists() { + info!("Create raft log directory: {}", self.path1.display()); + fs::create_dir(&self.path1).unwrap(); + } + if !self.path2.exists() { + info!("Create raft log directory: {}", self.path2.display()); + fs::create_dir(&self.path2).unwrap(); + } + let files1 = recover::get_files(&self.path1)?; + let files2 = recover::get_files(&self.path2)?; + + let count1 = recover::get_latest_valid_seq(&self.base, &files1)?; + let count2 = recover::get_latest_valid_seq(&self.base, &files2)?; + + match count1.cmp(&count2) { + std::cmp::Ordering::Equal => { + // still need to catch up, but only diff + recover::catch_up_diff(&self.base, files1, files2, false)?; + return Ok(()); + } + std::cmp::Ordering::Less => { + recover::catch_up_diff(&self.base, files2, files1, false)?; + } + std::cmp::Ordering::Greater => { + recover::catch_up_diff(&self.base, files1, files2, false)?; + } + } + Ok(()) + } + + fn create>(&self, path: P) -> IoResult { + block_on(self.wait_handle( + Task::Create(path.as_ref().to_path_buf()), + Task::Create(replace_path( + path.as_ref(), + self.path1.as_ref(), + self.path2.as_ref(), + )), + )) + } + + fn open>(&self, path: P, perm: Permission) -> IoResult { + block_on(self.wait_handle( + Task::Open { + path: path.as_ref().to_path_buf(), + perm, + }, + Task::Open { + path: replace_path(path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), + perm, + }, + )) + } + + fn delete>(&self, path: P) -> IoResult<()> { + block_on(self.wait( + Task::Delete(path.as_ref().to_path_buf()), + Task::Delete(replace_path( + path.as_ref(), + self.path1.as_ref(), + self.path2.as_ref(), + )), + )) + } + + fn rename>(&self, src_path: P, dst_path: P) -> IoResult<()> { + block_on(self.wait( + Task::Rename { + src_path: src_path.as_ref().to_path_buf(), + dst_path: dst_path.as_ref().to_path_buf(), + }, + Task::Rename { + src_path: replace_path(src_path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), + dst_path: replace_path(dst_path.as_ref(), self.path1.as_ref(), self.path2.as_ref()), + }, + )) + } + + fn new_reader(&self, handle: Arc) -> IoResult { + Ok(HedgedReader::new(handle)) + } + + fn new_writer(&self, handle: Arc) -> IoResult { + Ok(HedgedWriter::new(handle)) + } +} + +pub struct HedgedHandle { + base: Arc, + + // for rewrite file, all the operations should wait both disks finished + strong_consistent: bool, + + sender: HedgedSender, + + handle1: Arc, + handle2: Arc, + + seqno1: Arc, + seqno2: Arc, + + thread1: Option>, + thread2: 
Option>, +} + +impl HedgedHandle { + fn new( + base: Arc, + strong_consistent: bool, + mut sender: HedgedSender, + handle1: FutureHandle, + handle2: FutureHandle, + mut seqno1: Arc, + mut seqno2: Arc, + ) -> Self { + let mut thread1 = None; + let mut thread2 = None; + if strong_consistent { + // use two separated threads for both wait + let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); + let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); + // replace the seqno with self owned, then in `read` the seqno from two disks + // should be always the same. It's just to reuse the logic without + // adding special check in `read` + seqno1 = Arc::new(AtomicU64::new(0)); + seqno2 = Arc::new(AtomicU64::new(0)); + let poll = |(rx, seqno, fs): ( + Receiver<(SeqTask, Callback)>, + Arc, + Arc, + )| { + for (task, cb) in rx { + if let Task::Stop = task.inner { + break; + } + assert!(!matches!(task.inner, Task::Pause | Task::Snapshot)); + let res = task.handle_process(&fs); + seqno.fetch_add(1, Ordering::Relaxed); + cb(res).unwrap(); + } + }; + let args1 = (rx1, seqno1.clone(), base.clone()); + thread1 = Some(thread::spawn(move || { + poll(args1); + })); + let args2 = (rx2, seqno2.clone(), base.clone()); + thread2 = Some(thread::spawn(move || { + poll(args2); + })); + sender = HedgedSender::new(tx1, tx2); + } + + Self { + base, + strong_consistent, + sender, + handle1: Arc::new(handle1), + handle2: Arc::new(handle2), + seqno1, + seqno2, + thread1, + thread2, + } + } + + fn read(&self, offset: usize, buf: &mut [u8]) -> IoResult { + // Raft engine promises that the offset would be read only after the write is + // finished and memtable is updated. And the hedged file system promises that + // the write is done when either one of the disk finishes the write. Here the + // read data must be present in at least one of the disks. So choose the disk of + // largest seqno to read. + // + // Safety: the get for these two seqno is not necessary to be atomic. + // What if the seqno2 is updated after getting seqno1? It's fine, let's say + // - T1 denotes the time of getting seqno1, the actual seqno for disk1 and disk2 + // is S1, S2 + // - T2 denotes the time of getting seqno2, the actual seqno for disk1 and disk2 + // is S1', S2' + // Assume disk2 is just slightly slower than disk1, here is a possible case: + // - T1: S1 = 10, S2 = 9 + // - T2: S1'= 12, S2'= 11 + // Then, what we get would be seq1=10, seq2=11, and the read would be performed + // on disk2. But disk2 is slower than disk1. The data may not be written yet. + // Would the read on a slower disk is safe? + // Yes, it's safe because at T1 we know the data can be read at least with a + // seqno of S1, then at T2, S2' > S1, so the data must be already written in the + // disk2, even if it's the slow disk. + let seq1 = self.seqno1.load(Ordering::Relaxed); + let seq2 = self.seqno2.load(Ordering::Relaxed); + match seq1.cmp(&seq2) { + std::cmp::Ordering::Equal => { + // TODO: read simultaneously from both disks and return the faster one + if let Some(fd) = self.handle1.try_get(&self.base)? { + fd.read(offset, buf) + } else if let Some(fd) = self.handle2.try_get(&self.base)? 
{ + fd.read(offset, buf) + } else { + panic!("Both fd1 and fd2 are None"); + } + } + std::cmp::Ordering::Greater => { + self.handle1.try_get(&self.base)?.unwrap().read(offset, buf) + } + std::cmp::Ordering::Less => { + self.handle2.try_get(&self.base)?.unwrap().read(offset, buf) + } + } + } + + fn write(&self, offset: usize, content: &[u8]) -> IoResult { + block_on(self.wait( + Task::Write { + handle: self.handle1.clone(), + offset, + bytes: content.to_vec(), + }, + Task::Write { + handle: self.handle2.clone(), + offset, + bytes: content.to_vec(), + }, + )) + .map(|res| { + if let TaskRes::Write(size) = res { + size + } else { + unreachable!() + } + }) + } + + fn allocate(&self, offset: usize, size: usize) -> IoResult<()> { + block_on(self.wait( + Task::Allocate { + handle: self.handle1.clone(), + offset, + size, + }, + Task::Allocate { + handle: self.handle2.clone(), + offset, + size, + }, + )) + .map(|_| ()) + } + + async fn wait_one(&self, task1: Task, task2: Task) -> IoResult { + let (cb1, mut f1) = paired_future_callback(); + let (cb2, mut f2) = paired_future_callback(); + self.sender.send(task1, task2, cb1, cb2); + + select! { + res1 = f1 => res1.unwrap(), + res2 = f2 => res2.unwrap(), + } + } + + async fn wait_both(&self, task1: Task, task2: Task) -> IoResult { + let (cb1, f1) = paired_future_callback(); + let (cb2, f2) = paired_future_callback(); + self.sender.send(task1, task2, cb1, cb2); + + let (res1, res2) = join!(f1, f2); + match (res1.unwrap(), res2.unwrap()) { + (res @ Ok(_), Ok(_)) => res, + (Err(e), Err(_)) => Err(e), + (Err(e), _) => Err(e), + (_, Err(e)) => Err(e), + } + } + + async fn wait(&self, task1: Task, task2: Task) -> IoResult { + if self.strong_consistent { + self.wait_both(task1, task2).await + } else { + self.wait_one(task1, task2).await + } + } +} + +impl Handle for HedgedHandle { + fn truncate(&self, offset: usize) -> IoResult<()> { + block_on(self.wait( + Task::Truncate { + handle: self.handle1.clone(), + offset, + }, + Task::Truncate { + handle: self.handle2.clone(), + offset, + }, + )) + .map(|_| ()) + } + + fn file_size(&self) -> IoResult { + block_on(self.wait( + Task::FileSize(self.handle1.clone()), + Task::FileSize(self.handle2.clone()), + )) + .map(|res| { + if let TaskRes::FileSize(size) = res { + size + } else { + unreachable!() + } + }) + } + + fn sync(&self) -> IoResult<()> { + block_on(self.wait( + Task::Sync(self.handle1.clone()), + Task::Sync(self.handle2.clone()), + )) + .map(|_| ()) + } +} + +impl Drop for HedgedHandle { + fn drop(&mut self) { + if self.strong_consistent { + self.sender + .send(Task::Stop, Task::Stop, empty_callback(), empty_callback()); + self.thread1.take().unwrap().join().unwrap(); + self.thread2.take().unwrap().join().unwrap(); + } + } +} + +pub struct HedgedWriter { + inner: Arc, + offset: usize, +} + +impl HedgedWriter { + pub fn new(handle: Arc) -> Self { + Self { + inner: handle, + offset: 0, + } + } +} + +impl Write for HedgedWriter { + fn write(&mut self, buf: &[u8]) -> IoResult { + let len = self.inner.write(self.offset, buf)?; + self.offset += len; + Ok(len) + } + + fn flush(&mut self) -> IoResult<()> { + Ok(()) + } +} + +impl WriteExt for HedgedWriter { + fn truncate(&mut self, offset: usize) -> IoResult<()> { + self.inner.truncate(offset)?; + self.offset = offset; + Ok(()) + } + + fn allocate(&mut self, offset: usize, size: usize) -> IoResult<()> { + self.inner.allocate(offset, size) + } +} + +impl Seek for HedgedWriter { + fn seek(&mut self, pos: SeekFrom) -> IoResult { + match pos { + SeekFrom::Start(offset) 
=> self.offset = offset as usize, + SeekFrom::Current(i) => self.offset = (self.offset as i64 + i) as usize, + SeekFrom::End(i) => self.offset = (self.inner.file_size()? as i64 + i) as usize, + } + Ok(self.offset as u64) + } +} + +pub struct HedgedReader { + inner: Arc, + offset: usize, +} + +impl HedgedReader { + pub fn new(handle: Arc) -> Self { + Self { + inner: handle, + offset: 0, + } + } +} + +impl Seek for HedgedReader { + fn seek(&mut self, pos: SeekFrom) -> IoResult { + match pos { + SeekFrom::Start(offset) => self.offset = offset as usize, + SeekFrom::Current(i) => self.offset = (self.offset as i64 + i) as usize, + SeekFrom::End(i) => self.offset = (self.inner.file_size()? as i64 + i) as usize, + } + Ok(self.offset as u64) + } +} + +impl Read for HedgedReader { + fn read(&mut self, buf: &mut [u8]) -> IoResult { + let len = self.inner.read(self.offset, buf)?; + self.offset += len; + Ok(len) + } +} diff --git a/src/env/hedged/recover.rs b/src/env/hedged/recover.rs new file mode 100644 index 00000000..955aba5f --- /dev/null +++ b/src/env/hedged/recover.rs @@ -0,0 +1,255 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use crate::env::default::LogFile; +use crate::file_pipe_log::log_file::build_file_reader; +use crate::file_pipe_log::pipe_builder::FileName; +use crate::file_pipe_log::reader::LogItemBatchFileReader; +use crate::file_pipe_log::FileNameExt; +use crate::internals::parse_reserved_file_name; +use crate::internals::FileId; +use crate::internals::LogQueue; +use crate::Error; +use std::fs; +use std::io::Result as IoResult; +use std::path::PathBuf; +use std::sync::Arc; + +use crate::env::default::LogFd; +use crate::env::DefaultFileSystem; +use crate::env::{FileSystem, Permission}; + +use super::util::replace_path; + +#[derive(Default)] +pub(crate) struct Files { + pub prefix: PathBuf, + pub append_files: Vec, + pub rewrite_files: Vec, + pub reserved_files: Vec, +} + +pub(crate) enum SeqFile { + Path(FileName), + Handle((FileName, Arc)), +} + +impl SeqFile { + pub fn seq(&self) -> u64 { + match self { + SeqFile::Path(f) => f.seq, + SeqFile::Handle((f, _)) => f.seq, + } + } + + pub fn path(&self) -> &PathBuf { + match self { + SeqFile::Path(f) => &f.path, + SeqFile::Handle((f, _)) => &f.path, + } + } + + pub fn remove(&self) -> IoResult<()> { + match self { + SeqFile::Path(f) => fs::remove_file(&f.path), + SeqFile::Handle((f, _)) => fs::remove_file(&f.path), + } + } + + pub fn copy(&self, file_system: &DefaultFileSystem, to: &PathBuf) -> IoResult { + match self { + SeqFile::Path(f) => fs::copy(&f.path, to.as_path()), + SeqFile::Handle((_, fd)) => { + let mut reader = LogFile::new(fd.clone()); + let mut writer = LogFile::new(Arc::new(file_system.create(to)?)); + std::io::copy(&mut reader, &mut writer) + } + } + } + + pub fn into_handle(mut self, file_system: &DefaultFileSystem) -> Self { + if let SeqFile::Path(f) = self { + let fd = Arc::new(file_system.open(&f.path, Permission::ReadOnly).unwrap()); + self = SeqFile::Handle((f, fd)); + } + self + } +} + +pub(crate) fn catch_up_diff( + file_system: &Arc, + mut from_files: Files, + mut to_files: Files, + skip_rewrite: bool, +) -> IoResult<()> { + from_files + .append_files + .sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files.append_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + from_files + .rewrite_files + .sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files.rewrite_files.sort_by(|a, b| a.seq().cmp(&b.seq())); + from_files + .reserved_files + .sort_by(|a, b| a.seq().cmp(&b.seq())); + to_files + 
.reserved_files + .sort_by(|a, b| a.seq().cmp(&b.seq())); + + let check_files = |from: &Vec, to: &Vec| -> IoResult<()> { + let last_from_seq = from.last().map(|f| f.seq()).unwrap_or(0); + + let mut iter1 = from.iter().peekable(); + let mut iter2 = to.iter().peekable(); + // compare files of from and to, if the file in from is not in to, copy it to + // to, and if the file in to is not in from, delete it + loop { + match (iter1.peek(), iter2.peek()) { + (None, None) => break, + (Some(f1), None) => { + let to = replace_path( + f1.path().as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), + ); + f1.copy(file_system, &to)?; + iter1.next(); + } + (None, Some(f2)) => { + f2.remove()?; + iter2.next(); + } + (Some(f1), Some(f2)) => { + match f1.seq().cmp(&f2.seq()) { + std::cmp::Ordering::Equal => { + // check file size is not enough, treat the last files differently + // considering the recycle, always copy the last file + // TODO: only copy diff part + if f1.seq() == last_from_seq { + let to = replace_path( + f1.path().as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), + ); + f1.copy(file_system, &to)?; + } + iter1.next(); + iter2.next(); + } + std::cmp::Ordering::Less => { + let to = replace_path( + f1.path().as_ref(), + from_files.prefix.as_ref(), + to_files.prefix.as_ref(), + ); + f1.copy(file_system, &to)?; + iter1.next(); + } + std::cmp::Ordering::Greater => { + f2.remove()?; + iter2.next(); + } + } + } + } + } + Ok(()) + }; + + check_files(&from_files.append_files, &to_files.append_files)?; + if !skip_rewrite { + check_files(&from_files.rewrite_files, &to_files.rewrite_files)?; + } + check_files(&from_files.reserved_files, &to_files.reserved_files)?; + Ok(()) +} + +pub(crate) fn get_files(path: &PathBuf) -> IoResult { + assert!(path.exists()); + + let mut files = Files { + prefix: path.clone(), + ..Default::default() + }; + + fs::read_dir(path) + .unwrap() + .try_for_each(|e| -> IoResult<()> { + let dir_entry = e?; + let p = dir_entry.path(); + if !p.is_file() { + return Ok(()); + } + let file_name = p.file_name().unwrap().to_str().unwrap(); + match FileId::parse_file_name(file_name) { + Some(FileId { + queue: LogQueue::Append, + seq, + }) => files.append_files.push(SeqFile::Path(FileName { + seq, + path: p, + path_id: 0, + })), + Some(FileId { + queue: LogQueue::Rewrite, + seq, + }) => files.rewrite_files.push(SeqFile::Path(FileName { + seq, + path: p, + path_id: 0, + })), + _ => { + if let Some(seq) = parse_reserved_file_name(file_name) { + files.reserved_files.push(SeqFile::Path(FileName { + seq, + path: p, + path_id: 0, + })) + } + } + } + Ok(()) + }) + .unwrap(); + + Ok(files) +} + +pub(crate) fn get_latest_valid_seq( + file_system: &Arc, + files: &Files, +) -> IoResult { + let mut count = 0; + if let Some(f) = files.append_files.last() { + let recovery_read_block_size = 1024; + let mut reader = LogItemBatchFileReader::new(recovery_read_block_size); + let handle = Arc::new(file_system.open(&f.path(), Permission::ReadOnly)?); + let file_reader = build_file_reader(file_system.as_ref(), handle)?; + match reader.open( + FileId { + queue: LogQueue::Append, + seq: f.seq(), + }, + file_reader, + ) { + Err(e) => match e { + Error::Io(err) => return Err(err), + _ => return Ok(0), + }, + Ok(_) => { + // Do nothing + } + } + loop { + match reader.next() { + Ok(Some(_)) => { + count += 1; + } + Ok(None) => break, + Err(_) => break, + } + } + } + + Ok(count) +} diff --git a/src/env/hedged/runner.rs b/src/env/hedged/runner.rs new file mode 100644 index 
00000000..12324db4 --- /dev/null +++ b/src/env/hedged/runner.rs @@ -0,0 +1,125 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use crossbeam::channel::Receiver; +use fail::fail_point; +use std::io::Result as IoResult; +use std::path::PathBuf; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::thread; +use std::thread::JoinHandle; + +use crate::env::DefaultFileSystem; +use futures::executor::block_on; + +use super::recover; +use super::sender::HedgedSender; +use super::task::paired_future_callback; +use super::task::{Callback, SeqTask, Task, TaskRes}; + +pub(crate) struct TaskRunner { + id: u8, + path: PathBuf, + fs: Arc, + rx: Receiver<(SeqTask, Callback)>, + sender: HedgedSender, + seqno: Arc, +} + +impl TaskRunner { + pub fn new( + id: u8, + path: PathBuf, + fs: Arc, + rx: Receiver<(SeqTask, Callback)>, + sender: HedgedSender, + seqno: Arc, + ) -> Self { + Self { + id, + path, + fs, + rx, + sender, + seqno, + } + } + + pub fn spawn(self) -> JoinHandle<()> { + let id = self.id; + thread::Builder::new() + .name(format!("raft-engine-disk{}", id)) + .spawn(move || { + if let Err(e) = self.poll() { + panic!("disk {} failed: {:?}", id, e); + } + }) + .unwrap() + } + + fn poll(self) -> IoResult<()> { + let mut last_seq = 0; + let mut snap_seq = None; + for (task, cb) in self.rx { + if let Task::Stop = task.inner { + cb(Ok(TaskRes::Stop))?; + break; + } + if let Task::Pause = task.inner { + // Encountering `Pause`, indicate the disk may not slow anymore + let (cb, f) = paired_future_callback(); + self.sender.send_snapshot(cb); + let to_files = recover::get_files(&self.path)?; + let from_files = block_on(f).unwrap().map(|res| { + if let TaskRes::Snapshot((seq, files)) = res { + snap_seq = Some(seq); + files + } else { + unreachable!() + } + })?; + + // Snapshot doesn't include the file size, so it would copy more data than + // the data seen at the time of snapshot. But it's okay, as the data is + // written with specific offset, so the data written + // of no necessity will be overwritten by the latter writes. + // Exclude rewrite files because rewrite files are always synced. + recover::catch_up_diff(&self.fs, from_files, to_files, true)?; + + self.sender.finish_snapshot(); + self.seqno.store(snap_seq.unwrap(), Ordering::Relaxed); + last_seq = snap_seq.unwrap(); + continue; + } + if self.id == 1 { + fail_point!("hedged::task_runner::thread1"); + } + let seq = task.seq; + assert_ne!(seq, 0); + if let Some(snap) = snap_seq.as_ref() { + // the change already included in the snapshot + if seq < *snap { + continue; + } else if seq == *snap { + unreachable!(); + } else if seq == *snap + 1 { + snap_seq = None; + } else { + panic!("seqno {} is larger than snapshot seqno {}", seq, *snap); + } + } + + assert_eq!(last_seq + 1, seq); + last_seq = seq; + let res = task.process(&self.fs, &self.path); + // seqno should be updated before the write callback is called, otherwise one + // read may be performed right after the write is finished. Then the read may be + // performed on the other disk not having the data because the seqno for this + // disk is not updated yet. + self.seqno.store(seq, Ordering::Relaxed); + cb(res)?; + } + Ok(()) + } +} diff --git a/src/env/hedged/sender.rs b/src/env/hedged/sender.rs new file mode 100644 index 00000000..75c90134 --- /dev/null +++ b/src/env/hedged/sender.rs @@ -0,0 +1,140 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
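For orientation, the primitive that the runner, sender and task plumbing in this module keeps reusing is: push the same task into two per-disk channels, then resolve with whichever completion callback fires first. Below is a minimal, self-contained sketch of that pattern (plain threads stand in for the disk task runners and String for TaskRes; it illustrates the idea only and is not raft-engine code):

use std::thread;
use std::time::Duration;

use futures::channel::oneshot;
use futures::executor::block_on;
use futures::select;

// Resolve with whichever "disk" acknowledges first; the slower receiver is
// simply dropped here, while the real code keeps it inside a FutureHandle.
async fn first_done(
    mut rx1: oneshot::Receiver<String>,
    mut rx2: oneshot::Receiver<String>,
) -> String {
    select! {
        r = rx1 => r.unwrap(),
        r = rx2 => r.unwrap(),
    }
}

fn main() {
    let (tx1, rx1) = oneshot::channel();
    let (tx2, rx2) = oneshot::channel();
    // Disk 1 is slow, disk 2 answers immediately.
    thread::spawn(move || {
        thread::sleep(Duration::from_millis(50));
        let _ = tx1.send("disk1 done".to_owned());
    });
    thread::spawn(move || {
        let _ = tx2.send("disk2 done".to_owned());
    });
    println!("first result: {}", block_on(first_done(rx1, rx2)));
}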
+ +use crossbeam::channel::Sender; +use fail::fail_point; +use std::sync::Arc; +use std::sync::Mutex; + +use super::task::{empty_callback, Callback, SeqTask, Task}; + +// let say the average entry size is 100B, then the total size of the log in the +// channel is 1GB, +const PAUSE_THRESHOLD: usize = 10000; + +fn get_pause_threshold() -> usize { + fail_point!("hedged::pause_threshold", |s| s + .unwrap() + .parse::() + .unwrap()); + PAUSE_THRESHOLD +} + +#[derive(Debug, PartialEq, Clone)] +pub enum State { + Normal, + Paused1, /* When the length of channel of disk1 reaches threshold, a + * `Pause` task is sent and no more later task will be sent + * to disk1 */ + Paused2, // no more task will be sent to disk2 + Recovering, +} + +// Make sure the task is sent to two disks' channel atomically, otherwise the +// ordering of the tasks in two disks' channels are not same. +#[derive(Clone)] +pub(crate) struct HedgedSender(Arc>); + +struct HedgedSenderInner { + disk1: Sender<(SeqTask, Callback)>, + disk2: Sender<(SeqTask, Callback)>, + seq: u64, + state: State, +} + +impl HedgedSender { + pub fn new(disk1: Sender<(SeqTask, Callback)>, disk2: Sender<(SeqTask, Callback)>) -> Self { + Self(Arc::new(Mutex::new(HedgedSenderInner { + disk1, + disk2, + seq: 0, + state: State::Normal, + }))) + } + + pub fn state(&self) -> State { + self.0.lock().unwrap().state.clone() + } + + pub fn send(&self, task1: Task, task2: Task, cb1: Callback, cb2: Callback) { + if matches!(task1, Task::Pause | Task::Snapshot) { + unreachable!(); + } + + let mut inner = self.0.lock().unwrap(); + inner.seq += 1; + let task1 = SeqTask { + inner: task1, + seq: inner.seq, + }; + let task2 = SeqTask { + inner: task2, + seq: inner.seq, + }; + if matches!(inner.state, State::Normal) { + let check1 = inner.disk1.len() > get_pause_threshold(); + let check2 = inner.disk2.len() > get_pause_threshold(); + match (check1, check2) { + (true, true) => { + panic!("Both channels of disk1 and disk2 are full") + } + (true, false) => { + inner.state = State::Paused1; + inner + .disk1 + .send(( + SeqTask { + inner: Task::Pause, + seq: 0, + }, + empty_callback(), + )) + .unwrap(); + } + (false, true) => { + inner.state = State::Paused2; + inner + .disk2 + .send(( + SeqTask { + inner: Task::Pause, + seq: 0, + }, + empty_callback(), + )) + .unwrap(); + } + _ => {} + } + } + if !matches!(inner.state, State::Paused1) { + inner.disk1.send((task1, cb1)).unwrap(); + } + if !matches!(inner.state, State::Paused2) { + inner.disk2.send((task2, cb2)).unwrap(); + } + } + + pub fn send_snapshot(&self, cb: Callback) { + let mut inner = self.0.lock().unwrap(); + inner.seq += 1; + let task = SeqTask { + inner: Task::Snapshot, + seq: inner.seq, + }; + match inner.state { + State::Paused1 => { + inner.disk2.send((task, cb)).unwrap(); + } + State::Paused2 => { + inner.disk1.send((task, cb)).unwrap(); + } + _ => unreachable!(), + } + inner.state = State::Recovering; + } + + pub fn finish_snapshot(&self) { + let mut inner = self.0.lock().unwrap(); + inner.state = State::Normal; + } +} diff --git a/src/env/hedged/task.rs b/src/env/hedged/task.rs new file mode 100644 index 00000000..d2fad9e6 --- /dev/null +++ b/src/env/hedged/task.rs @@ -0,0 +1,249 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
+ +use std::cell::UnsafeCell; +use std::io::Result as IoResult; +use std::path::PathBuf; +use std::sync::Arc; + +use crate::env::default::LogFd; +use crate::env::DefaultFileSystem; +use crate::env::{FileSystem, Handle, Permission}; +use futures::channel::oneshot::{self, Canceled}; +use futures::executor::block_on; + +use super::recover::{self, Files}; + +use either::Either; + +pub(crate) type Callback = Box) -> IoResult + Send>; + +pub(crate) fn empty_callback() -> Callback { + Box::new(|_| Ok(TaskRes::Noop)) +} + +pub(crate) fn paired_future_callback() -> (Callback, oneshot::Receiver>) { + let (tx, future) = oneshot::channel(); + let callback = Box::new(move |result| -> IoResult { + if let Err(result) = tx.send(result) { + return result; + } + Ok(TaskRes::Noop) + }); + (callback, future) +} + +pub(crate) struct SeqTask { + pub inner: Task, + pub seq: u64, +} + +#[derive(Clone)] +pub(crate) enum Task { + Create(PathBuf), + Open { + path: PathBuf, + perm: Permission, + }, + Delete(PathBuf), + Rename { + src_path: PathBuf, + dst_path: PathBuf, + }, + Truncate { + handle: Arc, + offset: usize, + }, + FileSize(Arc), + Sync(Arc), + Write { + handle: Arc, + offset: usize, + bytes: Vec, + }, + Allocate { + handle: Arc, + offset: usize, + size: usize, + }, + Pause, + Snapshot, + Stop, +} + +impl SeqTask { + pub fn process(self, file_system: &DefaultFileSystem, path: &PathBuf) -> IoResult { + match self.inner { + Task::Create(path) => file_system.create(&path).map(|h| TaskRes::Create { + fd: h, + is_for_rewrite: path.extension().map_or(false, |ext| ext == "rewrite"), + }), + Task::Open { path, perm } => file_system.open(&path, perm).map(|h| TaskRes::Open { + fd: h, + is_for_rewrite: path.extension().map_or(false, |ext| ext == "rewrite"), + }), + Task::Delete(path) => file_system.delete(path).map(|_| TaskRes::Delete), + Task::Rename { src_path, dst_path } => file_system + .rename(src_path, dst_path) + .map(|_| TaskRes::Rename), + Task::Snapshot => { + let mut files = recover::get_files(&path)?; + files.append_files = files + .append_files + .into_iter() + .map(|f| f.into_handle(file_system)) + .collect(); + // exclude rewrite files, as they are always synced + files.reserved_files = files + .reserved_files + .into_iter() + .map(|f| f.into_handle(file_system)) + .collect(); + Ok(TaskRes::Snapshot((self.seq, files))) + } + Task::Stop | Task::Pause => unreachable!(), + _ => self.handle_process(file_system), + } + } + + pub fn handle_process(self, file_system: &DefaultFileSystem) -> IoResult { + match self.inner { + Task::Truncate { handle, offset } => handle + .get(file_system)? + .truncate(offset) + .map(|_| TaskRes::Truncate), + Task::FileSize(handle) => handle + .get(file_system)? + .file_size() + .map(|s| TaskRes::FileSize(s)), + Task::Sync(handle) => handle.get(file_system)?.sync().map(|_| TaskRes::Sync), + Task::Write { + handle, + offset, + bytes, + } => handle + .get(file_system)? + .write(offset, &bytes) + .map(|s| TaskRes::Write(s)), + Task::Allocate { + handle, + offset, + size, + } => handle + .get(file_system)? 
+ .allocate(offset, size) + .map(|_| TaskRes::Allocate), + _ => unreachable!(), + } + } +} + +pub(crate) enum TaskRes { + Noop, + Create { fd: LogFd, is_for_rewrite: bool }, + Open { fd: LogFd, is_for_rewrite: bool }, + Delete, + Rename, + Truncate, + FileSize(usize), + Sync, + Write(usize), + Allocate, + Snapshot((u64, Files)), + Stop, +} + +pub(crate) struct FutureHandle { + inner: UnsafeCell>, Arc>>, + task: Option, +} + +unsafe impl Send for FutureHandle {} + +// To avoid using `Mutex` +// Safety: +// For write, all writes are serialized to one channel, so only one thread will +// update the inner. For read, multiple readers and one writer and may visit +// try_get() concurrently to get the fd from receiver. The receiver is `Sync`, +// so only one of them will get the fd, and update the inner to Arc. +unsafe impl Sync for FutureHandle {} + +impl FutureHandle { + pub fn new(rx: oneshot::Receiver>, task: Task) -> Self { + Self { + inner: UnsafeCell::new(Either::Left(rx)), + task: Some(task), + } + } + pub fn new_owned(h: LogFd) -> Self { + Self { + inner: UnsafeCell::new(Either::Right(Arc::new(h))), + task: None, + } + } + + pub fn get(&self, file_system: &DefaultFileSystem) -> IoResult> { + let mut set = false; + let fd = match unsafe { &mut *self.inner.get() } { + Either::Left(rx) => { + set = true; + match block_on(rx) { + Err(Canceled) => self.retry_canceled(file_system)?, + Ok(res) => match res? { + TaskRes::Open { fd, .. } => Arc::new(fd), + TaskRes::Create { fd, .. } => Arc::new(fd), + _ => unreachable!(), + }, + } + } + Either::Right(w) => w.clone(), + }; + if set { + unsafe { + *self.inner.get() = Either::Right(fd.clone()); + } + } + Ok(fd) + } + + pub fn try_get(&self, file_system: &DefaultFileSystem) -> IoResult>> { + let mut set = false; + let fd = match unsafe { &mut *self.inner.get() } { + Either::Left(rx) => { + set = true; + match rx.try_recv() { + Err(Canceled) => self.retry_canceled(file_system)?, + Ok(None) => return Ok(None), + Ok(Some(res)) => match res? { + TaskRes::Open { fd, .. } => Arc::new(fd), + TaskRes::Create { fd, .. 
} => Arc::new(fd), + _ => unreachable!(), + }, + } + } + Either::Right(w) => w.clone(), + }; + if set { + unsafe { + *self.inner.get() = Either::Right(fd.clone()); + } + } + Ok(Some(fd)) + } + + fn retry_canceled(&self, file_system: &DefaultFileSystem) -> IoResult> { + // Canceled is caused by the task is dropped when in paused state, + // so we should retry the task now + Ok(match self.task.as_ref().unwrap() { + Task::Create(path) => { + // has been already created, so just open + let fd = file_system.open(path, Permission::ReadWrite)?; + Arc::new(fd) + } + Task::Open { path, perm } => { + let fd = file_system.open(path, *perm)?; + Arc::new(fd) + } + _ => unreachable!(), + }) + } +} diff --git a/src/env/hedged/util.rs b/src/env/hedged/util.rs new file mode 100644 index 00000000..f5f5b9f1 --- /dev/null +++ b/src/env/hedged/util.rs @@ -0,0 +1,9 @@ +use std::path::{Path, PathBuf}; + +pub fn replace_path(path: &Path, from: &Path, to: &Path) -> PathBuf { + if let Ok(file) = path.strip_prefix(from) { + to.to_path_buf().join(file) + } else { + panic!("Invalid path: {:?}", path); + } +} From 2518b1002c0f7a25cc49f0a3cb1eb23057d5e8de Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Mon, 18 Sep 2023 14:34:57 +0800 Subject: [PATCH 29/32] add comment Signed-off-by: Connor1996 --- src/env/hedged/mod.rs | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/env/hedged/mod.rs b/src/env/hedged/mod.rs index 1f5bc343..439cdf98 100644 --- a/src/env/hedged/mod.rs +++ b/src/env/hedged/mod.rs @@ -36,7 +36,35 @@ pub use sender::State; // TODO: add metrics // TODO: handle specially on config change(upgrade and downgrade) -// TODO: add comment and rename + +// In cloud environment, cloud disk IO may get stuck for a while due to cloud +// vendor infrastructure issues. This may affect the foreground latency +// dramatically. Raft log apply doesn't sync mostly, so it wouldn't be a +// problem. While raft log append is synced every time. To alleviate that, we +// can hedge raft log to two different cloud disks. If either one of them is +// synced, the raft log append is considered finished. Thus when one of the +// cloud disk IO is stuck, the other one can still work and doesn't affect +// foreground write flow. + +//Under the hood, the HedgedFileSystem manages two directories on different +// cloud disks. All operations of the interface are serialized by one channel +// for each disk and wait until either one of the channels is consumed. With +// that, if one of the disk's io is slow for a long time, the other can still +// serve the operations without any delay. And once the disk comes back to +// normal, it can catch up with the accumulated operations record in the +// channel. Then the states of the two disks can be synced again. + +// It relays on some raft-engine assumptions: +// 1. Raft log is append only. +// 2. Raft log is read-only once it's sealed. + +// For raft engine write thread model, only one thread writes WAL at one point. +// So not supporting writing WAL concurrently is not a big deal. But for the +// rewrite(GC), it is concurrent to WAL write. Making GC write operations +// serialized with WAL write may affect the performance pretty much. To avoid +// that, we can treat rewrite files especially that make rewrite operations wait +// both disks because rewrite is a background job that doesn’t affect foreground +// latency. 
As the rewrite files are the partial order pub struct HedgedFileSystem { base: Arc, @@ -53,8 +81,7 @@ pub struct HedgedFileSystem { thread2: Option>, } -// TODO: read both dir at recovery, maybe no need? cause operations are to both -// disks TODO: consider encryption +// TODO: consider encryption impl HedgedFileSystem { pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); From 56c40dcd336bd04bb5ca2f67e3d2bfc48ccc251f Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Mon, 18 Sep 2023 17:54:27 +0800 Subject: [PATCH 30/32] add comment Signed-off-by: Connor1996 --- src/env/hedged/mod.rs | 123 ++++++++++++++++++++++++++------------- src/env/hedged/runner.rs | 2 + src/env/hedged/task.rs | 11 +++- 3 files changed, 94 insertions(+), 42 deletions(-) diff --git a/src/env/hedged/mod.rs b/src/env/hedged/mod.rs index 439cdf98..f0f8f6d8 100644 --- a/src/env/hedged/mod.rs +++ b/src/env/hedged/mod.rs @@ -34,38 +34,77 @@ use util::replace_path; pub use sender::State; +/// In cloud environment, cloud disk IO may get stuck for a while due to cloud +/// vendor infrastructure issues. This may affect the foreground latency +/// dramatically. Raft log apply doesn't sync mostly, so it wouldn't be a +/// problem. While raft log append is synced every time. To alleviate that, we +/// can write raft log to two different cloud disks. If either one of them is +/// synced, the raft log append is considered finished. Thus when one of the +/// cloud disk IO is stuck, the other one can still work and doesn't affect +/// foreground write flow. +/// +/// Under the hood, the file system manages two directories on different +/// cloud disks. All operations are serialized as tasks attached with a +/// monotonic increasing sequence number in the channel. The task is sent to +/// both disks and wait until either one of the channels consumes the task. With +/// that, if one of the disk's io is slow for a long time, the other can still +/// serve the operations without any delay. As we know, if two state machines +/// apply several changes in same order with same initial state, the final state +/// of the two state machines must be the same. So once the slow disk comes back +/// normal, it can catch up with the accumulated operations record in the +/// channel. Then the contents of the two disks can be identical still. +/// +/// For raft engine write thread model, only one thread writes WAL at one point. +/// So not supporting writing WAL concurrently is not a big deal. But for the +/// rewrite(GC), it is concurrent to WAL write. Making GC write operations +/// serialized with WAL write may affect the performance pretty much as write io +/// is performed only by one thread. For performance consideration, we can treat +/// rewrite files especially that make rewrite operations wait both disks +/// because rewrite is a background job that doesn’t affect foreground latency. +/// +/// For read, for performance consideration, it doesn't serialize read +/// operations to disks' channel. It just reads from the disk which has handled +/// task of larger sequence number. +/// +/// But what if it's blocking for a long time or down, then the infinite +/// accumulated operations record would exhaust the memory causing OOM. 
To avoid +/// that, once the channel piles up to some extent, it just abandons that disk, +/// which sends an `Pause` task to the channel and does not send tasks to that +/// disk's channel anymore, while operations on the rewrite files' is not paused +/// and still wait on both disks. So it has the chance that one disk written a +/// rewrite entry while the origin entry is not written yet. But it's fine as +/// rewrite read is performed on the faster disk which has the origin entry. +/// +/// And later on, when the blocked disk comes back to normal, it would +/// finally consume to the end of channel where is the `Pause` task indicating +/// the disk is recovered. Then the slow disk would request a snapshot from +/// faster disk, let the normal disk copy all the append files to that disk. +/// And new changes resume being sent to the disk's channel. With that, all the +/// writes to the new mutable append file during the copy won't be lost, and +/// two disks can be synced eventually. +/// +/// As the raft log file number increases monotonically, after a restart, it can +/// check the latest append(exclude rewrite files) file number to decide +/// which disk has the latest data. If the largest file numbers are the same, we +/// can't simply use the file size to decide due to file recycling. Instead, +/// scan the two files to find the biggest offset of the latest and valid record +/// in them. And finally, regarding the disk of the bigger one as the latest. +/// The latest one is responsible to sync the missing append files to the other +/// disk. +/// +/// It relays on some raft-engine invariants: +/// 1. Raft log is append only. +/// 2. Raft log is read-only once it's sealed. +/// Here is the TLA+ proof https://github.com/pingcap/tla-plus/pull/41 + // TODO: add metrics // TODO: handle specially on config change(upgrade and downgrade) +// TODO: fallback to one disk, if the other disk is down for a long time and +// close to full +// TODO: consider encryption -// In cloud environment, cloud disk IO may get stuck for a while due to cloud -// vendor infrastructure issues. This may affect the foreground latency -// dramatically. Raft log apply doesn't sync mostly, so it wouldn't be a -// problem. While raft log append is synced every time. To alleviate that, we -// can hedge raft log to two different cloud disks. If either one of them is -// synced, the raft log append is considered finished. Thus when one of the -// cloud disk IO is stuck, the other one can still work and doesn't affect -// foreground write flow. - -//Under the hood, the HedgedFileSystem manages two directories on different -// cloud disks. All operations of the interface are serialized by one channel -// for each disk and wait until either one of the channels is consumed. With -// that, if one of the disk's io is slow for a long time, the other can still -// serve the operations without any delay. And once the disk comes back to -// normal, it can catch up with the accumulated operations record in the -// channel. Then the states of the two disks can be synced again. - -// It relays on some raft-engine assumptions: -// 1. Raft log is append only. -// 2. Raft log is read-only once it's sealed. - -// For raft engine write thread model, only one thread writes WAL at one point. -// So not supporting writing WAL concurrently is not a big deal. But for the -// rewrite(GC), it is concurrent to WAL write. Making GC write operations -// serialized with WAL write may affect the performance pretty much. 
To avoid -// that, we can treat rewrite files especially that make rewrite operations wait -// both disks because rewrite is a background job that doesn’t affect foreground -// latency. As the rewrite files are the partial order - +// All operations of file system trait are sent to the channels of both disks +// through `HedgedFileSystem`. pub struct HedgedFileSystem { base: Arc, @@ -81,7 +120,6 @@ pub struct HedgedFileSystem { thread2: Option>, } -// TODO: consider encryption impl HedgedFileSystem { pub fn new(base: Arc, path1: PathBuf, path2: PathBuf) -> Self { let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); @@ -303,20 +341,25 @@ impl FileSystem for HedgedFileSystem { } } +// HedgedHandle wraps two handles, and send the same operation to both disk's +// file handles pub struct HedgedHandle { base: Arc, - // for rewrite file, all the operations should wait both disks finished - strong_consistent: bool, - sender: HedgedSender, + // The two file handles for each disk handle1: Arc, handle2: Arc, + // The sequence number of the latest handled task for each disk seqno1: Arc, seqno2: Arc, + // For rewrite file, all the operations should wait both disks finished + rewrite_file: bool, + // The two threads for handling operations on rewrite files which is separated from the disk + // channel thread for performance consideration thread1: Option>, thread2: Option>, } @@ -324,7 +367,7 @@ pub struct HedgedHandle { impl HedgedHandle { fn new( base: Arc, - strong_consistent: bool, + rewrite_file: bool, mut sender: HedgedSender, handle1: FutureHandle, handle2: FutureHandle, @@ -333,13 +376,13 @@ impl HedgedHandle { ) -> Self { let mut thread1 = None; let mut thread2 = None; - if strong_consistent { + if rewrite_file { // use two separated threads for both wait let (tx1, rx1) = unbounded::<(SeqTask, Callback)>(); let (tx2, rx2) = unbounded::<(SeqTask, Callback)>(); // replace the seqno with self owned, then in `read` the seqno from two disks - // should be always the same. It's just to reuse the logic without - // adding special check in `read` + // should be always the same. It's just to reuse the logic without adding + // special check in `read` seqno1 = Arc::new(AtomicU64::new(0)); seqno2 = Arc::new(AtomicU64::new(0)); let poll = |(rx, seqno, fs): ( @@ -370,7 +413,7 @@ impl HedgedHandle { Self { base, - strong_consistent, + rewrite_file, sender, handle1: Arc::new(handle1), handle2: Arc::new(handle2), @@ -489,7 +532,7 @@ impl HedgedHandle { } async fn wait(&self, task1: Task, task2: Task) -> IoResult { - if self.strong_consistent { + if self.rewrite_file { self.wait_both(task1, task2).await } else { self.wait_one(task1, task2).await @@ -537,7 +580,7 @@ impl Handle for HedgedHandle { impl Drop for HedgedHandle { fn drop(&mut self) { - if self.strong_consistent { + if self.rewrite_file { self.sender .send(Task::Stop, Task::Stop, empty_callback(), empty_callback()); self.thread1.take().unwrap().join().unwrap(); diff --git a/src/env/hedged/runner.rs b/src/env/hedged/runner.rs index 12324db4..031051ef 100644 --- a/src/env/hedged/runner.rs +++ b/src/env/hedged/runner.rs @@ -18,6 +18,8 @@ use super::sender::HedgedSender; use super::task::paired_future_callback; use super::task::{Callback, SeqTask, Task, TaskRes}; +// TaskRunner is a thread runner that handles the disk IO tasks. It would poll +// the channel until receiving a `Stop` task. 
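The rewrite_file flag renamed above is what selects between the two waiting strategies described in the module comment: append-file operations return once either disk acknowledges, while rewrite-file operations wait for both. A self-contained sketch of the two strategies, with oneshot receivers standing in for the per-disk callbacks and u64 for TaskRes (illustrative only, not the actual raft-engine types):

use std::io::Result as IoResult;

use futures::channel::oneshot;
use futures::executor::block_on;
use futures::{join, select};

// Append files: resolve as soon as either disk acknowledges the task.
async fn wait_one(
    mut f1: oneshot::Receiver<IoResult<u64>>,
    mut f2: oneshot::Receiver<IoResult<u64>>,
) -> IoResult<u64> {
    select! {
        r = f1 => r.unwrap(),
        r = f2 => r.unwrap(),
    }
}

// Rewrite files: wait for both disks and surface the first error, if any.
async fn wait_both(
    f1: oneshot::Receiver<IoResult<u64>>,
    f2: oneshot::Receiver<IoResult<u64>>,
) -> IoResult<u64> {
    let (r1, r2) = join!(f1, f2);
    match (r1.unwrap(), r2.unwrap()) {
        (res @ Ok(_), Ok(_)) => res,
        (Err(e), _) | (_, Err(e)) => Err(e),
    }
}

fn main() {
    let (tx1, f1) = oneshot::channel();
    let (tx2, f2) = oneshot::channel();
    tx1.send(Ok(8u64)).unwrap();
    tx2.send(Ok(8u64)).unwrap();
    assert_eq!(block_on(wait_both(f1, f2)).unwrap(), 8);

    let (tx3, f3) = oneshot::channel();
    let (_tx4, f4) = oneshot::channel::<IoResult<u64>>();
    tx3.send(Ok(1u64)).unwrap();
    assert_eq!(block_on(wait_one(f3, f4)).unwrap(), 1);
}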
pub(crate) struct TaskRunner { id: u8, path: PathBuf, diff --git a/src/env/hedged/task.rs b/src/env/hedged/task.rs index d2fad9e6..70b169da 100644 --- a/src/env/hedged/task.rs +++ b/src/env/hedged/task.rs @@ -152,6 +152,11 @@ pub(crate) enum TaskRes { Stop, } +// A helper struct to get the fd from callback in the future. As on the creation +// of `HedgedHandle`, only one of the fd is retrieved at the time, so we can +// need to check the receiver to get the fd for the later get. If the fd is +// ready, we will update the inner to `Arc`, so the later get will get +// the fd directly. pub(crate) struct FutureHandle { inner: UnsafeCell>, Arc>>, task: Option, @@ -187,6 +192,8 @@ impl FutureHandle { Either::Left(rx) => { set = true; match block_on(rx) { + // Canceled is caused by the task is dropped when in paused state, + // so we should retry the task now Err(Canceled) => self.retry_canceled(file_system)?, Ok(res) => match res? { TaskRes::Open { fd, .. } => Arc::new(fd), @@ -211,6 +218,8 @@ impl FutureHandle { Either::Left(rx) => { set = true; match rx.try_recv() { + // Canceled is caused by the task is dropped when in paused state, + // so we should retry the task now Err(Canceled) => self.retry_canceled(file_system)?, Ok(None) => return Ok(None), Ok(Some(res)) => match res? { @@ -231,8 +240,6 @@ impl FutureHandle { } fn retry_canceled(&self, file_system: &DefaultFileSystem) -> IoResult> { - // Canceled is caused by the task is dropped when in paused state, - // so we should retry the task now Ok(match self.task.as_ref().unwrap() { Task::Create(path) => { // has been already created, so just open From 58132d399b66988b36323de86858d56f7342d2ea Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Mon, 18 Sep 2023 17:58:45 +0800 Subject: [PATCH 31/32] clean Signed-off-by: Connor1996 --- src/engine.rs | 47 +-------------------------------------- src/env/hedged/mod.rs | 1 - src/env/mod.rs | 2 +- src/file_pipe_log/pipe.rs | 5 ----- src/purge.rs | 6 ----- 5 files changed, 2 insertions(+), 59 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index cccbf9b4..dab1b5b4 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -10,6 +10,7 @@ use std::time::{Duration, Instant}; use log::{error, info}; use protobuf::{parse_from_bytes, Message}; +use crate::config::{Config, RecoveryMode}; use crate::consistency::ConsistencyChecker; use crate::env::{DefaultFileSystem, FileSystem}; use crate::event_listener::EventListener; @@ -21,40 +22,12 @@ use crate::metrics::*; use crate::pipe_log::{FileBlockHandle, FileId, LogQueue, PipeLog}; use crate::purge::{PurgeHook, PurgeManager}; use crate::write_barrier::{WriteBarrier, Writer}; -use crate::{ - config::{Config, RecoveryMode}, - env::HedgedFileSystem, -}; use crate::{perf_context, Error, GlobalStats, Result}; const METRICS_FLUSH_INTERVAL: Duration = Duration::from_secs(30); /// Max times for `write`. 
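The helper-struct comment added above describes a resolve-once-then-cache handle: the fd may still be in flight through the slow disk's channel, so the handle stores either the pending receiver or the resolved Arc<LogFd>. The real FutureHandle avoids a lock by relying on UnsafeCell plus the serialization guarantees spelled out in its Safety comment; the sketch below shows the same idea with a plain Mutex and u32 standing in for LogFd (a simplification for illustration, not the actual implementation):

use std::sync::{Arc, Mutex};

use futures::channel::oneshot;
use futures::executor::block_on;

// Either the fd is still being produced by the disk thread (Pending), or it
// has already been resolved and cached (Ready).
enum Inner {
    Pending(oneshot::Receiver<u32>),
    Ready(Arc<u32>),
}

struct LazyHandle {
    inner: Mutex<Inner>,
}

impl LazyHandle {
    fn new(rx: oneshot::Receiver<u32>) -> Self {
        Self { inner: Mutex::new(Inner::Pending(rx)) }
    }

    // Block until the fd is available, then cache it so later calls are cheap.
    fn get(&self) -> Arc<u32> {
        let mut guard = self.inner.lock().unwrap();
        if let Inner::Ready(fd) = &*guard {
            return fd.clone();
        }
        // Take the receiver out, wait for the disk thread, then cache the result.
        let fd = match std::mem::replace(&mut *guard, Inner::Ready(Arc::new(0))) {
            Inner::Pending(rx) => Arc::new(block_on(rx).expect("sender dropped")),
            Inner::Ready(fd) => fd,
        };
        *guard = Inner::Ready(fd.clone());
        fd
    }
}

fn main() {
    let (tx, rx) = oneshot::channel();
    let handle = LazyHandle::new(rx);
    tx.send(42).unwrap();
    assert_eq!(*handle.get(), 42);
    assert_eq!(*handle.get(), 42); // second call hits the cached Arc
}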
const MAX_WRITE_ATTEMPT: u64 = 2; -// pub struct HedgedEngine> -// where -// F: FileSystem, -// P: PipeLog, -// { -// inner: Engine, P>, -// fs: Arc>, -// } - -// impl HedgedEngine> -// where -// F: FileSystem, -// { - -// } - -// impl Deref for HedgedEngine> { -// type Target = Engine>; - -// fn deref(&self) -> &Self::Target { -// &self.inner -// } -// } - pub struct Engine> where F: FileSystem, @@ -90,24 +63,6 @@ impl Engine> { } } -pub fn open_with_hedged_file_system( - cfg: Config, - file_system: Arc, -) -> Result>> { - let file_system = if let Some(ref sec_dir) = cfg.second_dir { - let fs = Arc::new(HedgedFileSystem::new( - file_system, - cfg.dir.clone().into(), - sec_dir.clone().into(), - )); - fs.bootstrap()?; - fs - } else { - panic!() - }; - Engine::open_with(cfg, file_system, vec![]) -} - impl Engine> where F: FileSystem, diff --git a/src/env/hedged/mod.rs b/src/env/hedged/mod.rs index f0f8f6d8..c2821b27 100644 --- a/src/env/hedged/mod.rs +++ b/src/env/hedged/mod.rs @@ -97,7 +97,6 @@ pub use sender::State; /// 2. Raft log is read-only once it's sealed. /// Here is the TLA+ proof https://github.com/pingcap/tla-plus/pull/41 -// TODO: add metrics // TODO: handle specially on config change(upgrade and downgrade) // TODO: fallback to one disk, if the other disk is down for a long time and // close to full diff --git a/src/env/mod.rs b/src/env/mod.rs index 4916c6f2..43650ff2 100644 --- a/src/env/mod.rs +++ b/src/env/mod.rs @@ -21,7 +21,7 @@ pub enum Permission { } /// FileSystem -pub trait FileSystem: Send + Sync + 'static { +pub trait FileSystem: Send + Sync { type Handle: Send + Sync + Handle; type Reader: Seek + Read + Send; type Writer: Seek + Write + Send + WriteExt; diff --git a/src/file_pipe_log/pipe.rs b/src/file_pipe_log/pipe.rs index f4eb2260..5a5916ea 100644 --- a/src/file_pipe_log/pipe.rs +++ b/src/file_pipe_log/pipe.rs @@ -513,11 +513,6 @@ impl PipeLog for DualPipes { queue: LogQueue, bytes: &mut T, ) -> Result { - // if self.file_system.need_recover() { - // self.pipes[LogQueue::Append].rotate(); - // self.pipes[LogQueue::Rewrite].rotate(); - // self.file_system.trigger_recover(); - // } self.pipes[queue as usize].append(bytes) } diff --git a/src/purge.rs b/src/purge.rs index c7f889fc..cb76f776 100644 --- a/src/purge.rs +++ b/src/purge.rs @@ -75,12 +75,6 @@ where pub fn purge_expired_files(&self) -> Result> { let _t = StopWatch::new(&*ENGINE_PURGE_DURATION_HISTOGRAM); - // Purge would delete files, whereas the files may be copied by recovery - // process, so do not purge when recovering. 
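With the open_with_hedged_file_system helper removed above, a caller that wants the hedged setup now wires it up explicitly, roughly as the deleted code did. A sketch of that wiring, written as crate-internal code and assuming HedgedFileSystem remains exported from crate::env as in the earlier patches of this series (cfg.second_dir is expected to point at a directory on the second disk):

use std::sync::Arc;

use crate::env::{DefaultFileSystem, FileSystem, HedgedFileSystem};
use crate::{Config, Engine, Result};

fn open_hedged(cfg: Config) -> Result<()> {
    let sec_dir = cfg.second_dir.clone().expect("second_dir must be set");
    let fs = Arc::new(HedgedFileSystem::new(
        Arc::new(DefaultFileSystem {}),
        cfg.dir.clone().into(),
        sec_dir.into(),
    ));
    // Reconcile the two directories (catch up the lagging disk) before opening.
    fs.bootstrap()?;
    let _engine = Engine::open_with_file_system(cfg, fs)?;
    // ... use the engine as usual ...
    Ok(())
}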
- // if self.file_system().is_in_recover() { - // info!("skip purge due to in recover"); - // return Ok(vec![]); - // } let guard = self.force_rewrite_candidates.try_lock(); if guard.is_none() { warn!("Unable to purge expired files: locked"); From 5c7e7b25c978652fb1077fdb3620a314ba933225 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Mon, 18 Sep 2023 18:06:34 +0800 Subject: [PATCH 32/32] clean Signed-off-by: Connor1996 --- src/engine.rs | 120 +++++++++++++++++++++++++++++--------------------- 1 file changed, 71 insertions(+), 49 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index b369d5c0..d8f21b41 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -71,7 +71,7 @@ where cfg: Config, file_system: Arc, ) -> Result>> { - Engine::open_with(cfg, file_system, vec![]) + Self::open_with(cfg, file_system, vec![]) } fn open_with( @@ -739,7 +739,8 @@ pub(crate) mod tests { dir: sub_dir.to_str().unwrap().to_owned(), ..Default::default() }; - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); } #[test] @@ -757,7 +758,7 @@ pub(crate) mod tests { ..Default::default() }; - let engine = Engine::open_with_file_system( + let engine = RaftLogEngine::open_with_file_system( cfg.clone(), Arc::new(ObfuscatedFileSystem::default()), ) @@ -811,7 +812,7 @@ pub(crate) mod tests { target_file_size: ReadableSize(1), ..Default::default() }; - let engine = Engine::open_with_file_system( + let engine = RaftLogEngine::open_with_file_system( cfg.clone(), Arc::new(ObfuscatedFileSystem::default()), ) @@ -885,7 +886,8 @@ pub(crate) mod tests { }; let rid = 1; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); engine .scan_messages::(rid, None, None, false, |_, _| { @@ -973,7 +975,8 @@ pub(crate) mod tests { delete_batch.delete(rid, key.clone()); let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); assert_eq!( engine.get_message::(rid, &key).unwrap(), None @@ -1083,7 +1086,8 @@ pub(crate) mod tests { ..Default::default() }; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); let data = vec![b'x'; 1024]; // rewrite:[1 ..10] @@ -1195,7 +1199,8 @@ pub(crate) mod tests { }; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); let data = vec![b'x'; 1024]; for index in 0..100 { engine.append(1, index, index + 1, Some(&data)); @@ -1255,7 +1260,8 @@ pub(crate) mod tests { }; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); let data = vec![b'x'; 1024]; // write 50 small entries into region 1~3, it should trigger force compact. 
for rid in 1..=3 { @@ -1309,7 +1315,8 @@ pub(crate) mod tests { ..Default::default() }; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); let data = vec![b'x'; 1024]; // Put 100 entries into 10 regions. @@ -1374,7 +1381,8 @@ pub(crate) mod tests { ..Default::default() }; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); let mut log_batch = LogBatch::default(); let empty_entry = Entry::new(); @@ -1433,7 +1441,8 @@ pub(crate) mod tests { ..Default::default() }; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); let data = vec![b'x'; 16]; let cases = [[false, false], [false, true], [true, true]]; for (i, writes) in cases.iter().enumerate() { @@ -1460,7 +1469,8 @@ pub(crate) mod tests { ..Default::default() }; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); let data = vec![b'x'; 1024]; for rid in 1..21 { @@ -1492,7 +1502,8 @@ pub(crate) mod tests { ..Default::default() }; let engine = - Engine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())).unwrap(); + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); let data = vec![b'x'; 2 * 1024 * 1024]; for rid in 1..=3 { @@ -1650,7 +1661,7 @@ pub(crate) mod tests { ..Default::default() }; - let engine = Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); for bs in batches.iter_mut() { for batch in bs.iter_mut() { engine.write(batch, false).unwrap(); @@ -1711,7 +1722,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1748,7 +1759,7 @@ pub(crate) mod tests { ) .unwrap(); - let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); for rid in 1..25 { engine.scan_entries(rid, 1, 6, |_, _, d| { assert_eq!(d, &entry_data); @@ -1776,7 +1787,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1810,7 +1821,7 @@ pub(crate) mod tests { ) .unwrap(); - let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); for rid in 1..25 { if existing_emptied.contains(&rid) || incoming_emptied.contains(&rid) { continue; @@ -1857,7 +1868,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = 
RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=50 { engine.append(rid, 1, 6, Some(&entry_data)); } @@ -1878,11 +1889,11 @@ pub(crate) mod tests { // Corrupt a log batch. f.set_len(f.metadata().unwrap().len() - 1).unwrap(); - Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); // Corrupt the file header. f.set_len(1).unwrap(); - Engine::open_with_file_system(cfg, fs).unwrap(); + RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); } #[test] @@ -1899,7 +1910,7 @@ pub(crate) mod tests { }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } @@ -1907,7 +1918,7 @@ pub(crate) mod tests { assert!(RaftLogEngine::open(cfg.clone()).is_err()); - let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); for rid in 1..10 { engine.scan_entries(rid, 1, 11, |_, _, d| { assert_eq!(d, &entry_data); @@ -1959,7 +1970,7 @@ pub(crate) mod tests { let fs = Arc::new(ObfuscatedFileSystem::default()); let rid = 1; - let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); assert!(engine.is_empty()); engine.append(rid, 1, 11, Some(&entry_data)); assert!(!engine.is_empty()); @@ -2096,7 +2107,7 @@ pub(crate) mod tests { ..Default::default() }; let fs = Arc::new(DeleteMonitoredFileSystem::new()); - let engine = Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } @@ -2154,7 +2165,7 @@ pub(crate) mod tests { }; let recycle_capacity = cfg.recycle_capacity() as u64; let fs = Arc::new(DeleteMonitoredFileSystem::new()); - let engine = Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); let reserved_start = *fs.reserved_metadata.lock().unwrap().first().unwrap(); for rid in 1..=10 { @@ -2262,14 +2273,14 @@ pub(crate) mod tests { assert!(cfg_v2.recycle_capacity() > 0); // Prepare files with format_version V1 { - let engine = Engine::open_with_file_system(cfg_v1.clone(), fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_v1.clone(), fs.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 11, Some(&entry_data)); } } // Reopen the Engine with V2 and purge { - let engine = Engine::open_with_file_system(cfg_v2.clone(), fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_v2.clone(), fs.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); for rid in 6..=10 { engine.append(rid, 11, 20, Some(&entry_data)); @@ -2283,7 +2294,7 @@ pub(crate) mod tests { } // Reopen the Engine with V1 -> V2 and purge { - let engine = Engine::open_with_file_system(cfg_v1, fs.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_v1, fs.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); for rid in 6..=10 { engine.append(rid, 20, 30, Some(&entry_data)); @@ -2297,7 +2308,7 @@ pub(crate) mod tests { assert_eq!(engine.file_span(LogQueue::Append).0, start); let file_count = 
engine.file_count(Some(LogQueue::Append)); drop(engine); - let engine = Engine::open_with_file_system(cfg_v2, fs).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_v2, fs).unwrap(); assert_eq!(engine.file_span(LogQueue::Append).0, start); assert_eq!(engine.file_count(Some(LogQueue::Append)), file_count); // Mark all regions obsolete. @@ -2328,7 +2339,7 @@ pub(crate) mod tests { enable_log_recycle: false, ..Default::default() }; - let engine = Engine::open_with_file_system(cfg, file_system.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, file_system.clone()).unwrap(); let (start, _) = engine.file_span(LogQueue::Append); // Only one valid file left, the last one => active_file. assert_eq!(engine.file_count(Some(LogQueue::Append)), 1); @@ -2350,7 +2361,8 @@ pub(crate) mod tests { prefill_for_recycle: true, ..Default::default() }; - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); let (start, end) = engine.file_span(LogQueue::Append); // Only one valid file left, the last one => active_file. assert_eq!(start, end); @@ -2373,7 +2385,8 @@ pub(crate) mod tests { purge_threshold: ReadableSize(50), ..cfg }; - let engine = Engine::open_with_file_system(cfg_v2.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg_v2.clone(), file_system.clone()).unwrap(); assert_eq!(engine.file_span(LogQueue::Append), (start, end)); assert!(recycled_count > file_system.inner.file_count() - engine.file_count(None)); // Recycled files have filled the LogQueue::Append, purge_expired_files won't @@ -2397,7 +2410,7 @@ pub(crate) mod tests { prefill_for_recycle: false, ..cfg_v2 }; - let engine = Engine::open_with_file_system(cfg_v3, file_system.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_v3, file_system.clone()).unwrap(); assert_eq!(file_system.inner.file_count(), engine.file_count(None)); } @@ -2418,7 +2431,7 @@ pub(crate) mod tests { let key = vec![b'x'; 2]; let value = vec![b'y'; 8]; - let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); let mut data = HashSet::new(); let mut rid = 1; // Directly write to pipe log. @@ -2585,7 +2598,7 @@ pub(crate) mod tests { ..Default::default() }; let fs = Arc::new(ObfuscatedFileSystem::default()); - let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); let value = vec![b'y'; 8]; let mut log_batch = LogBatch::default(); log_batch.put_unchecked(1, crate::make_internal_key(&[1]), value.clone()); @@ -2685,7 +2698,8 @@ pub(crate) mod tests { }; // Step 1: write data into the main directory. 
- let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2699,7 +2713,7 @@ pub(crate) mod tests { purge_threshold: ReadableSize(40), ..cfg }; - let engine = Engine::open_with_file_system(cfg_2, file_system).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_2, file_system).unwrap(); assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 1); @@ -2748,7 +2762,8 @@ pub(crate) mod tests { }; // Step 1: write data into the main directory. - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2774,7 +2789,8 @@ pub(crate) mod tests { // abnormal case - Empty second dir { std::fs::remove_dir_all(sec_dir.path()).unwrap(); - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); // All files in first dir are copied to second dir assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); @@ -2796,7 +2812,8 @@ pub(crate) mod tests { file_count += 1; } } - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); // Missing append files are copied assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); @@ -2818,7 +2835,8 @@ pub(crate) mod tests { file_count += 1; } } - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); // Missing rewrite files are copied assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); @@ -2836,7 +2854,8 @@ pub(crate) mod tests { file_count += 1; } } - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); // Missing reserve files are copied assert_eq!(number_of_files(sec_dir.path()), number_of_files(dir.path())); assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); @@ -2855,7 +2874,8 @@ pub(crate) mod tests { .unwrap(); } } - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); // Extra files are untouched. assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); @@ -2874,7 +2894,8 @@ pub(crate) mod tests { f.write_all(b"corrupted").unwrap(); } } - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); // Corrupted files are untouched. 
assert_ne!(number_of_files(sec_dir.path()), number_of_files(dir.path())); assert_eq!(calculate_hash(sec_dir.path()), calculate_hash(dir.path())); @@ -2911,7 +2932,8 @@ pub(crate) mod tests { }; { // Step 1: write data into the main directory. - let engine = Engine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); for rid in 1..=10 { engine.append(rid, 1, 10, Some(&entry_data)); } @@ -2948,7 +2970,7 @@ pub(crate) mod tests { ..cfg.clone() }; let recycle_capacity = cfg_2.recycle_capacity() as u64; - let engine = Engine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); assert!(number_of_files(spill_dir.path()) > 0); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 1); @@ -2975,7 +2997,7 @@ pub(crate) mod tests { ..cfg }; drop(engine); - let engine = Engine::open_with_file_system(cfg_3, file_system).unwrap(); + let engine = RaftLogEngine::open_with_file_system(cfg_3, file_system).unwrap(); assert!(number_of_files(spill_dir.path()) > 0); for rid in 1..=10 { assert_eq!(engine.first_index(rid).unwrap(), 20);