From 45c80c2a5f0be53c1dc295e7540ce4b2cc2364af Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Thu, 5 Dec 2024 18:02:19 +0900 Subject: [PATCH] local_working_copy: optimize path comparison in prefixed file states MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since all entries in filtered file states share the same directory prefix, we don't need to compare full file paths. The added functions take (path, name) instead of (path, sub_path) because the comparison can be slightly faster if the name is guaranteed to be a single path component. Benchmark: 1. original (omitted) 2. per-directory spawn (omitted) 3. per-directory deleted files (previous patch) 4. shorter path comparison (this patch) gecko-dev (~357k files, ~25k dirs) ``` % JJ_CONFIG=/dev/null hyperfine --sort command --warmup 3 --runs 30 .. Benchmark 3: target/release-with-debug/jj-3 -R ~/mirrors/gecko-dev debug snapshot Time (mean ± σ): 480.1 ms ± 8.8 ms [User: 3190.5 ms, System: 2127.2 ms] Range (min … max): 471.2 ms … 509.8 ms 30 runs Benchmark 4: target/release-with-debug/jj-4 -R ~/mirrors/gecko-dev debug snapshot Time (mean ± σ): 404.0 ms ± 4.4 ms [User: 1933.4 ms, System: 2148.8 ms] Range (min … max): 396.4 ms … 416.9 ms 30 runs Relative speed comparison 1.19 ± 0.03 target/release-with-debug/jj-3 -R ~/mirrors/gecko-dev debug snapshot 1.00 target/release-with-debug/jj-4 -R ~/mirrors/gecko-dev debug snapshot ``` linux (~87k files, ~6k dirs) ``` % JJ_CONFIG=/dev/null hyperfine --sort command --warmup 3 --runs 30 .. Benchmark 3: target/release-with-debug/jj-3 -R ~/mirrors/linux debug snapshot Time (mean ± σ): 204.2 ms ± 3.0 ms [User: 667.3 ms, System: 545.6 ms] Range (min … max): 197.1 ms … 209.2 ms 30 runs Benchmark 4: target/release-with-debug/jj-4 -R ~/mirrors/linux debug snapshot Time (mean ± σ): 191.3 ms ± 3.3 ms [User: 467.4 ms, System: 542.2 ms] Range (min … max): 186.1 ms … 200.6 ms 30 runs Relative speed comparison 1.07 ± 0.02 target/release-with-debug/jj-3 -R ~/mirrors/linux debug snapshot 1.00 target/release-with-debug/jj-4 -R ~/mirrors/linux debug snapshot ``` nixpkgs (~45k files, ~31k dirs) ``` % JJ_CONFIG=/dev/null hyperfine --sort command --warmup 3 --runs 30 .. Benchmark 3: target/release-with-debug/jj-3 -R ~/mirrors/nixpkgs debug snapshot Time (mean ± σ): 173.3 ms ± 6.7 ms [User: 899.4 ms, System: 889.0 ms] Range (min … max): 166.5 ms … 197.9 ms 30 runs Benchmark 4: target/release-with-debug/jj-4 -R ~/mirrors/nixpkgs debug snapshot Time (mean ± σ): 161.7 ms ± 2.5 ms [User: 739.1 ms, System: 881.7 ms] Range (min … max): 156.5 ms … 166.4 ms 30 runs Relative speed comparison 1.07 ± 0.04 target/release-with-debug/jj-3 -R ~/mirrors/nixpkgs debug snapshot 1.00 target/release-with-debug/jj-4 -R ~/mirrors/nixpkgs debug snapshot ``` git (~4.5k files, 0.2k dirs) ``` % JJ_CONFIG=/dev/null hyperfine --sort command --warmup 30 --runs 50 .. Benchmark 3: target/release-with-debug/jj-3 -R ~/mirrors/git debug snapshot Time (mean ± σ): 28.8 ms ± 1.0 ms [User: 33.0 ms, System: 37.6 ms] Range (min … max): 26.8 ms … 31.3 ms 50 runs Benchmark 4: target/release-with-debug/jj-4 -R ~/mirrors/git debug snapshot Time (mean ± σ): 28.8 ms ± 1.9 ms [User: 30.3 ms, System: 36.5 ms] Range (min … max): 26.0 ms … 39.2 ms 50 runs Relative speed comparison 1.00 target/release-with-debug/jj-3 -R ~/mirrors/git debug snapshot 1.00 ± 0.08 target/release-with-debug/jj-4 -R ~/mirrors/git debug snapshot ``` --- lib/src/local_working_copy.rs | 141 ++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 7 deletions(-) diff --git a/lib/src/local_working_copy.rs b/lib/src/local_working_copy.rs index da6a2fb38e..177176cd74 100644 --- a/lib/src/local_working_copy.rs +++ b/lib/src/local_working_copy.rs @@ -16,6 +16,7 @@ #![allow(clippy::let_unit_value)] use std::any::Any; +use std::cmp::Ordering; use std::collections::HashSet; use std::error::Error; use std::fs; @@ -272,6 +273,13 @@ impl<'a> FileStates<'a> { Self::from_sorted(&self.data[range]) } + /// Faster version of `prefixed("/")`. Requires that all entries + /// share the same prefix `dir`. + fn prefixed_at(&self, dir: &RepoPath, base: &RepoPathComponent) -> Self { + let range = self.prefixed_range_at(dir, base); + Self::from_sorted(&self.data[range]) + } + /// Returns true if this contains no entries. pub fn is_empty(&self) -> bool { self.data.is_empty() @@ -289,12 +297,36 @@ impl<'a> FileStates<'a> { Some(state) } + /// Faster version of `get("/")`. Requires that all entries share + /// the same prefix `dir`. + fn get_at(&self, dir: &RepoPath, name: &RepoPathComponent) -> Option { + let pos = self.exact_position_at(dir, name)?; + let (_, state) = file_state_entry_from_proto(&self.data[pos]); + Some(state) + } + fn exact_position(&self, path: &RepoPath) -> Option { self.data .binary_search_by(|entry| RepoPath::from_internal_string(&entry.path).cmp(path)) .ok() } + fn exact_position_at(&self, dir: &RepoPath, name: &RepoPathComponent) -> Option { + debug_assert!(self.paths().all(|path| path.starts_with(dir))); + let slash_len = !dir.is_root() as usize; + let prefix_len = dir.as_internal_file_string().len() + slash_len; + self.data + .binary_search_by(|entry| { + let tail = entry.path.get(prefix_len..).unwrap_or(""); + match tail.split_once('/') { + // "/*" > "" + Some((pre, _)) => pre.cmp(name.as_internal_str()).then(Ordering::Greater), + None => tail.cmp(name.as_internal_str()), + } + }) + .ok() + } + fn prefixed_range(&self, base: &RepoPath) -> Range { let start = self .data @@ -304,6 +336,23 @@ impl<'a> FileStates<'a> { start..(start + len) } + fn prefixed_range_at(&self, dir: &RepoPath, base: &RepoPathComponent) -> Range { + debug_assert!(self.paths().all(|path| path.starts_with(dir))); + let slash_len = !dir.is_root() as usize; + let prefix_len = dir.as_internal_file_string().len() + slash_len; + let start = self.data.partition_point(|entry| { + let tail = entry.path.get(prefix_len..).unwrap_or(""); + let entry_name = tail.split_once('/').map_or(tail, |(name, _)| name); + entry_name < base.as_internal_str() + }); + let len = self.data[start..].partition_point(|entry| { + let tail = entry.path.get(prefix_len..).unwrap_or(""); + let entry_name = tail.split_once('/').map_or(tail, |(name, _)| name); + entry_name == base.as_internal_str() + }); + start..(start + len) + } + /// Iterates file state entries sorted by path. pub fn iter(&self) -> FileStatesIter<'a> { self.data.iter().map(file_state_entry_from_proto) @@ -1122,15 +1171,16 @@ impl FileSnapshotter<'_> { ) -> Result, SnapshotError> { let file_type = entry.file_type().unwrap(); let file_name = entry.file_name(); - let name = file_name + let name_string = file_name .into_string() .map_err(|path| SnapshotError::InvalidUtf8Path { path })?; - if RESERVED_DIR_NAMES.contains(&name.as_str()) { + if RESERVED_DIR_NAMES.contains(&name_string.as_str()) { return Ok(None); } - let path = dir.join(RepoPathComponent::new(&name)); - let maybe_current_file_state = file_states.get(&path); + let name = RepoPathComponent::new(&name_string); + let path = dir.join(name); + let maybe_current_file_state = file_states.get_at(dir, name); if let Some(file_state) = &maybe_current_file_state { if file_state.file_type == FileType::GitSubmodule { return Ok(None); @@ -1138,7 +1188,7 @@ impl FileSnapshotter<'_> { } if file_type.is_dir() { - let file_states = file_states.prefixed(&path); + let file_states = file_states.prefixed_at(dir, name); if git_ignore.matches(&path.to_internal_dir_string()) || self.start_tracking_matcher.visit(&path).is_nothing() { @@ -1161,7 +1211,7 @@ impl FileSnapshotter<'_> { } // Whether or not the directory path matches, any child file entries // shouldn't be touched within the current recursion step. - Ok(Some((PresentDirEntryKind::Dir, name))) + Ok(Some((PresentDirEntryKind::Dir, name_string))) } else if self.matcher.matches(&path) { if let Some(progress) = self.progress { progress(&path); @@ -1198,7 +1248,7 @@ impl FileSnapshotter<'_> { maybe_current_file_state.as_ref(), new_file_state, )?; - Ok(Some((PresentDirEntryKind::File, name))) + Ok(Some((PresentDirEntryKind::File, name_string))) } else { // Special file is not considered present Ok(None) @@ -2349,4 +2399,81 @@ mod tests { assert_eq!(file_states.get(repo_path("bc")), Some(new_state(5))); assert_eq!(file_states.get(repo_path("z")), None); } + + #[test] + fn test_file_states_lookup_at() { + let new_state = |size| FileState { + file_type: FileType::Normal { + executable: FileExecutableFlag::default(), + }, + mtime: MillisSinceEpoch(0), + size, + }; + let new_proto_entry = |path: &str, size| { + file_state_entry_to_proto(repo_path(path).to_owned(), &new_state(size)) + }; + let data = vec![ + new_proto_entry("b/c", 0), + new_proto_entry("b/d/e", 1), + new_proto_entry("b/d#", 2), // '#' < '/' + new_proto_entry("b/e", 3), + new_proto_entry("b#", 4), // '#' < '/' + ]; + let file_states = FileStates::from_sorted(&data); + + // At root + assert_eq!( + file_states.get_at(RepoPath::root(), RepoPathComponent::new("b")), + None + ); + assert_eq!( + file_states.get_at(RepoPath::root(), RepoPathComponent::new("b#")), + Some(new_state(4)) + ); + + // At prefixed dir + let prefixed_states = + file_states.prefixed_at(RepoPath::root(), RepoPathComponent::new("b")); + assert_eq!( + prefixed_states.paths().collect_vec(), + ["b/c", "b/d/e", "b/d#", "b/e"].map(repo_path) + ); + assert_eq!( + prefixed_states.get_at(repo_path("b"), RepoPathComponent::new("c")), + Some(new_state(0)) + ); + assert_eq!( + prefixed_states.get_at(repo_path("b"), RepoPathComponent::new("d")), + None + ); + assert_eq!( + prefixed_states.get_at(repo_path("b"), RepoPathComponent::new("d#")), + Some(new_state(2)) + ); + + // At nested prefixed dir + let prefixed_states = + prefixed_states.prefixed_at(repo_path("b"), RepoPathComponent::new("d")); + assert_eq!( + prefixed_states.paths().collect_vec(), + ["b/d/e"].map(repo_path) + ); + assert_eq!( + prefixed_states.get_at(repo_path("b/d"), RepoPathComponent::new("e")), + Some(new_state(1)) + ); + assert_eq!( + prefixed_states.get_at(repo_path("b/d"), RepoPathComponent::new("#")), + None + ); + + // At prefixed file + let prefixed_states = + file_states.prefixed_at(RepoPath::root(), RepoPathComponent::new("b#")); + assert_eq!(prefixed_states.paths().collect_vec(), ["b#"].map(repo_path)); + assert_eq!( + prefixed_states.get_at(repo_path("b#"), RepoPathComponent::new("#")), + None + ); + } }