From 8caec186c1a65e95c410dac1ef5dfee32dceb182 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Mon, 2 Dec 2024 19:39:25 +0900 Subject: [PATCH] local_working_copy: filter deleted files per directory (or job) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This greatly reduces the amount of paths to be sent over the channel and the strings to be hashed. Benchmark: 1. original (omitted) 2. per-directory spawn (previous patch) 3. per-directory deleted files (this patch) 4. shorter path comparison (omitted) gecko-dev (~357k files, ~25k dirs) ``` % JJ_CONFIG=/dev/null hyperfine --sort command --warmup 3 --runs 30 .. Benchmark 2: target/release-with-debug/jj-2 -R ~/mirrors/gecko-dev debug snapshot Time (mean ± σ): 710.7 ms ± 9.1 ms [User: 3070.7 ms, System: 2142.6 ms] Range (min … max): 695.9 ms … 740.1 ms 30 runs Benchmark 3: target/release-with-debug/jj-3 -R ~/mirrors/gecko-dev debug snapshot Time (mean ± σ): 480.1 ms ± 8.8 ms [User: 3190.5 ms, System: 2127.2 ms] Range (min … max): 471.2 ms … 509.8 ms 30 runs Relative speed comparison 1.76 ± 0.03 target/release-with-debug/jj-2 -R ~/mirrors/gecko-dev debug snapshot 1.19 ± 0.03 target/release-with-debug/jj-3 -R ~/mirrors/gecko-dev debug snapshot ``` linux (~87k files, ~6k dirs) ``` % JJ_CONFIG=/dev/null hyperfine --sort command --warmup 3 --runs 30 .. 
Benchmark 2: target/release-with-debug/jj-2 -R ~/mirrors/linux debug snapshot Time (mean ± σ): 242.3 ms ± 3.3 ms [User: 656.8 ms, System: 538.0 ms] Range (min … max): 236.9 ms … 252.3 ms 30 runs Benchmark 3: target/release-with-debug/jj-3 -R ~/mirrors/linux debug snapshot Time (mean ± σ): 204.2 ms ± 3.0 ms [User: 667.3 ms, System: 545.6 ms] Range (min … max): 197.1 ms … 209.2 ms 30 runs Relative speed comparison 1.27 ± 0.03 target/release-with-debug/jj-2 -R ~/mirrors/linux debug snapshot 1.07 ± 0.02 target/release-with-debug/jj-3 -R ~/mirrors/linux debug snapshot ``` nixpkgs (~45k files, ~31k dirs) ``` % JJ_CONFIG=/dev/null hyperfine --sort command --warmup 3 --runs 30 .. Benchmark 2: target/release-with-debug/jj-2 -R ~/mirrors/nixpkgs debug snapshot Time (mean ± σ): 190.7 ms ± 4.1 ms [User: 859.3 ms, System: 881.1 ms] Range (min … max): 184.6 ms … 202.4 ms 30 runs Benchmark 3: target/release-with-debug/jj-3 -R ~/mirrors/nixpkgs debug snapshot Time (mean ± σ): 173.3 ms ± 6.7 ms [User: 899.4 ms, System: 889.0 ms] Range (min … max): 166.5 ms … 197.9 ms 30 runs Relative speed comparison 1.18 ± 0.03 target/release-with-debug/jj-2 -R ~/mirrors/nixpkgs debug snapshot 1.07 ± 0.04 target/release-with-debug/jj-3 -R ~/mirrors/nixpkgs debug snapshot ``` git (~4.5k files, 0.2k dirs) ``` % JJ_CONFIG=/dev/null hyperfine --sort command --warmup 30 --runs 50 .. 
Benchmark 2: target/release-with-debug/jj-2 -R ~/mirrors/git debug snapshot Time (mean ± σ): 30.6 ms ± 1.1 ms [User: 33.8 ms, System: 39.0 ms] Range (min … max): 29.0 ms … 35.0 ms 50 runs Benchmark 3: target/release-with-debug/jj-3 -R ~/mirrors/git debug snapshot Time (mean ± σ): 28.8 ms ± 1.0 ms [User: 33.0 ms, System: 37.6 ms] Range (min … max): 26.8 ms … 31.3 ms 50 runs Relative speed comparison 1.06 ± 0.05 target/release-with-debug/jj-2 -R ~/mirrors/git debug snapshot 1.00 target/release-with-debug/jj-3 -R ~/mirrors/git debug snapshot ``` --- lib/src/local_working_copy.rs | 130 ++++++++++++++++++--------- lib/tests/test_local_working_copy.rs | 49 ++++++++++ 2 files changed, 137 insertions(+), 42 deletions(-) diff --git a/lib/src/local_working_copy.rs b/lib/src/local_working_copy.rs index ec1cb5e96d..da6a2fb38e 100644 --- a/lib/src/local_working_copy.rs +++ b/lib/src/local_working_copy.rs @@ -40,6 +40,7 @@ use std::sync::Arc; use std::sync::OnceLock; use std::time::UNIX_EPOCH; +use either::Either; use futures::StreamExt; use itertools::EitherOrBoth; use itertools::Itertools; @@ -883,14 +884,14 @@ impl TreeState { let matcher = IntersectionMatcher::new(sparse_matcher.as_ref(), fsmonitor_matcher); if matcher.visit(RepoPath::root()).is_nothing() { - // No need to iterate file states to build empty deleted_files. + // No need to load the current tree, set up channels, etc. self.watchman_clock = watchman_clock; return Ok(is_dirty); } let (tree_entries_tx, tree_entries_rx) = channel(); let (file_states_tx, file_states_rx) = channel(); - let (present_files_tx, present_files_rx) = channel(); + let (deleted_files_tx, deleted_files_rx) = channel(); trace_span!("traverse filesystem").in_scope(|| -> Result<(), SnapshotError> { let snapshotter = FileSnapshotter { @@ -901,7 +902,7 @@ impl TreeState { // Move tx sides so they'll be dropped at the end of the scope. 
tree_entries_tx, file_states_tx, - present_files_tx, + deleted_files_tx, error: OnceLock::new(), progress, max_new_file_size, @@ -923,34 +924,18 @@ impl TreeState { })?; let mut tree_builder = MergedTreeBuilder::new(self.tree_id.clone()); - let mut deleted_files: HashSet<_> = - trace_span!("collecting existing files").in_scope(|| { - // Since file_states shouldn't contain files excluded by the sparse patterns, - // fsmonitor_matcher here is identical to the intersected matcher. - let file_states = self.file_states.all(); - file_states - .iter() - .filter(|(path, state)| { - fsmonitor_matcher.matches(path) && state.file_type != FileType::GitSubmodule - }) - .map(|(path, _state)| path.to_owned()) - .collect() - }); trace_span!("process tree entries").in_scope(|| { for (path, tree_values) in &tree_entries_rx { tree_builder.set_or_remove(path, tree_values); } }); - trace_span!("process present files").in_scope(|| { - for path in &present_files_rx { - deleted_files.remove(&path); - } - }); - trace_span!("process deleted tree entries").in_scope(|| { + let deleted_files = trace_span!("process deleted tree entries").in_scope(|| { + let deleted_files = HashSet::from_iter(deleted_files_rx); is_dirty |= !deleted_files.is_empty(); for file in &deleted_files { tree_builder.set_or_remove(file.clone(), Merge::absent()); } + deleted_files }); trace_span!("process file states").in_scope(|| { let changed_file_states = file_states_rx @@ -1033,6 +1018,18 @@ struct DirectoryToVisit<'a> { file_states: FileStates<'a>, } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum PresentDirEntryKind { + Dir, + File, +} + +#[derive(Clone, Debug)] +struct PresentDirEntries { + dirs: HashSet<String>, + files: HashSet<String>, +} + /// Helper to scan local-disk directories and files in parallel. 
struct FileSnapshotter<'a> { tree_state: &'a TreeState, @@ -1041,7 +1038,7 @@ struct FileSnapshotter<'a> { start_tracking_matcher: &'a dyn Matcher, tree_entries_tx: Sender<(RepoPathBuf, MergedTreeValue)>, file_states_tx: Sender<(RepoPathBuf, FileState)>, - present_files_tx: Sender<RepoPathBuf>, + deleted_files_tx: Sender<RepoPathBuf>, error: OnceLock<SnapshotError>, progress: Option<&'a SnapshotProgress<'a>>, max_new_file_size: u64, @@ -1095,14 +1092,23 @@ impl FileSnapshotter<'_> { message: format!("Failed to read directory {}", disk_dir.display()), err: err.into(), })?; - dir_entries + let (dirs, files) = dir_entries .into_par_iter() // Don't split into too many small jobs. For a small directory, // sequential scan should be fast enough. .with_min_len(100) - .try_for_each(|entry| { + .filter_map(|entry| { self.process_dir_entry(&dir, &git_ignore, file_states, &entry, scope) - })?; + .transpose() + }) + .map(|item| match item { + Ok((PresentDirEntryKind::Dir, name)) => Ok(Either::Left(name)), + Ok((PresentDirEntryKind::File, name)) => Ok(Either::Right(name)), + Err(err) => Err(err), + }) + .collect::<Result<_, _>>()?; + let present_entries = PresentDirEntries { dirs, files }; + self.emit_deleted_files(&dir, file_states, &present_entries); Ok(()) } @@ -1113,23 +1119,21 @@ impl FileSnapshotter<'_> { file_states: FileStates<'scope>, entry: &DirEntry, scope: &rayon::Scope<'scope>, - ) -> Result<(), SnapshotError> { + ) -> Result<Option<(PresentDirEntryKind, String)>, SnapshotError> { let file_type = entry.file_type().unwrap(); let file_name = entry.file_name(); let name = file_name - .to_str() - .ok_or_else(|| SnapshotError::InvalidUtf8Path { - path: file_name.clone(), - })?; + .into_string() + .map_err(|path| SnapshotError::InvalidUtf8Path { path })?; - if RESERVED_DIR_NAMES.contains(&name) { - return Ok(()); + if RESERVED_DIR_NAMES.contains(&name.as_str()) { + return Ok(None); } - let path = dir.join(RepoPathComponent::new(name)); + let path = dir.join(RepoPathComponent::new(&name)); let maybe_current_file_state = file_states.get(&path); if let 
Some(file_state) = &maybe_current_file_state { if file_state.file_type == FileType::GitSubmodule { - return Ok(()); + return Ok(None); } } @@ -1155,6 +1159,9 @@ impl FileSnapshotter<'_> { self.visit_directory(directory_to_visit, scope) }); } + // Whether or not the directory path matches, any child file entries + // shouldn't be touched within the current recursion step. + Ok(Some((PresentDirEntryKind::Dir, name))) } else if self.matcher.matches(&path) { if let Some(progress) = self.progress { progress(&path); @@ -1164,11 +1171,13 @@ impl FileSnapshotter<'_> { { // If it wasn't already tracked and it matches // the ignored paths, then ignore it. + Ok(None) } else if maybe_current_file_state.is_none() && !self.start_tracking_matcher.matches(&path) { // Leave the file untracked // TODO: Report this path to the caller + Ok(None) } else { let metadata = entry.metadata().map_err(|err| SnapshotError::Other { message: format!("Failed to stat file {}", entry.path().display()), @@ -1189,10 +1198,15 @@ impl FileSnapshotter<'_> { maybe_current_file_state.as_ref(), new_file_state, )?; + Ok(Some((PresentDirEntryKind::File, name))) + } else { + // Special file is not considered present + Ok(None) } } + } else { + Ok(None) } - Ok(()) } /// Visits only paths we're already tracking. 
@@ -1203,10 +1217,8 @@ impl FileSnapshotter<'_> { } let disk_path = tracked_path.to_fs_path(&self.tree_state.working_copy_path)?; let metadata = match disk_path.symlink_metadata() { - Ok(metadata) => metadata, - Err(err) if err.kind() == io::ErrorKind::NotFound => { - continue; - } + Ok(metadata) => Some(metadata), + Err(err) if err.kind() == io::ErrorKind::NotFound => None, Err(err) => { return Err(SnapshotError::Other { message: format!("Failed to stat file {}", disk_path.display()), @@ -1214,13 +1226,15 @@ impl FileSnapshotter<'_> { }); } }; - if let Some(new_file_state) = file_state(&metadata) { + if let Some(new_file_state) = metadata.as_ref().and_then(file_state) { self.process_present_file( tracked_path.to_owned(), &disk_path, Some(&current_file_state), new_file_state, )?; + } else { + self.deleted_files_tx.send(tracked_path.to_owned()).ok(); } } Ok(()) } @@ -1233,7 +1247,6 @@ impl FileSnapshotter<'_> { maybe_current_file_state: Option<&FileState>, new_file_state: FileState, ) -> Result<(), SnapshotError> { - self.present_files_tx.send(path.clone()).ok(); let update = self.get_updated_tree_value( &path, disk_path, @@ -1249,6 +1262,39 @@ impl FileSnapshotter<'_> { Ok(()) } + /// Emits file paths that don't exist in the `present_entries`. + fn emit_deleted_files( + &self, + dir: &RepoPath, + file_states: FileStates<'_>, + present_entries: &PresentDirEntries, + ) { + let file_state_chunks = file_states.iter().chunk_by(|(path, _state)| { + // Extract <name> from <name>, <dir>/<name>, or <dir>/<name>/**. + // (file_states may contain <name> file on file->dir transition.) 
+ debug_assert!(path.starts_with(dir)); + let slash = !dir.is_root() as usize; + let len = dir.as_internal_file_string().len() + slash; + let tail = path.as_internal_file_string().get(len..).unwrap_or(""); + match tail.split_once('/') { + Some((name, _)) => (PresentDirEntryKind::Dir, name), + None => (PresentDirEntryKind::File, tail), + } + }); + file_state_chunks + .into_iter() + .filter(|&((kind, name), _)| match kind { + PresentDirEntryKind::Dir => !present_entries.dirs.contains(name), + PresentDirEntryKind::File => !present_entries.files.contains(name), + }) + .flat_map(|(_, chunk)| chunk) + // Whether or not the entry exists, submodule should be ignored + .filter(|(_, state)| state.file_type != FileType::GitSubmodule) + .filter(|(path, _)| self.matcher.matches(path)) + .try_for_each(|(path, _)| self.deleted_files_tx.send(path.to_owned())) + .ok(); + } + fn get_updated_tree_value( &self, repo_path: &RepoPath, diff --git a/lib/tests/test_local_working_copy.rs b/lib/tests/test_local_working_copy.rs index 3f739923b4..610de0ed96 100644 --- a/lib/tests/test_local_working_copy.rs +++ b/lib/tests/test_local_working_copy.rs @@ -750,6 +750,55 @@ fn test_checkout_discard() { assert!(!reloaded_wc.file_states().unwrap().contains_path(file2_path)); } +#[test] +fn test_snapshot_file_directory_transition() { + let settings = testutils::user_settings(); + let mut test_workspace = TestWorkspace::init(&settings); + let repo = test_workspace.repo.clone(); + let workspace_root = test_workspace.workspace.workspace_root().to_owned(); + let checkout_options = CheckoutOptions::empty_for_test(); + let to_ws_path = |path: &RepoPath| path.to_fs_path(&workspace_root).unwrap(); + + // file <-> directory transition at root and sub directories + let file1_path = RepoPath::from_internal_string("foo/bar"); + let file2_path = RepoPath::from_internal_string("sub/bar/baz"); + let file1p_path = file1_path.parent().unwrap(); + let file2p_path = file2_path.parent().unwrap(); + + let tree1 = 
create_tree(&repo, &[(file1p_path, "1p"), (file2p_path, "2p")]); + let tree2 = create_tree(&repo, &[(file1_path, "1"), (file2_path, "2")]); + let commit1 = commit_with_tree(repo.store(), tree1.id()); + let commit2 = commit_with_tree(repo.store(), tree2.id()); + + let ws = &mut test_workspace.workspace; + ws.check_out(repo.op_id().clone(), None, &commit1, &checkout_options) + .unwrap(); + + // file -> directory + std::fs::remove_file(to_ws_path(file1p_path)).unwrap(); + std::fs::remove_file(to_ws_path(file2p_path)).unwrap(); + std::fs::create_dir(to_ws_path(file1p_path)).unwrap(); + std::fs::create_dir(to_ws_path(file2p_path)).unwrap(); + std::fs::write(to_ws_path(file1_path), "1").unwrap(); + std::fs::write(to_ws_path(file2_path), "2").unwrap(); + let new_tree = test_workspace.snapshot().unwrap(); + assert_eq!(new_tree.id(), tree2.id()); + + let ws = &mut test_workspace.workspace; + ws.check_out(repo.op_id().clone(), None, &commit2, &checkout_options) + .unwrap(); + + // directory -> file + std::fs::remove_file(to_ws_path(file1_path)).unwrap(); + std::fs::remove_file(to_ws_path(file2_path)).unwrap(); + std::fs::remove_dir(to_ws_path(file1p_path)).unwrap(); + std::fs::remove_dir(to_ws_path(file2p_path)).unwrap(); + std::fs::write(to_ws_path(file1p_path), "1p").unwrap(); + std::fs::write(to_ws_path(file2p_path), "2p").unwrap(); + let new_tree = test_workspace.snapshot().unwrap(); + assert_eq!(new_tree.id(), tree1.id()); +} + #[test] fn test_materialize_snapshot_conflicted_files() { let settings = testutils::user_settings();