Skip to content

Commit

Permalink
local_working_copy: filter deleted files per directory (or job)
Browse files Browse the repository at this point in the history
This greatly reduces the amount of paths to be sent over the channel.

Benchmark:
- jj-1: previous
- jj-2: this patch

gecko-dev (large):
```
% JJ_CONFIG=/dev/null hyperfine --sort command --warmup 3 --runs 10 -L bin jj-0,jj-1,jj-2 \
  'target/release-with-debug/{bin} -R ~/mirrors/gecko-dev debug snapshot'
Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/gecko-dev debug snapshot
  Time (mean ± σ):     659.6 ms ±  11.8 ms    [User: 2673.2 ms, System: 1882.1 ms]
  Range (min … max):   649.2 ms … 690.1 ms    10 runs

Benchmark 3: target/release-with-debug/jj-2 -R ~/mirrors/gecko-dev debug snapshot
  Time (mean ± σ):     440.5 ms ±  11.8 ms    [User: 2811.0 ms, System: 1967.9 ms]
  Range (min … max):   432.7 ms … 473.0 ms    10 runs

Relative speed comparison
        1.50 ±  0.05  target/release-with-debug/jj-1 -R ~/mirrors/gecko-dev debug snapshot
        1.00          target/release-with-debug/jj-2 -R ~/mirrors/gecko-dev debug snapshot
```

linux (mid-size):
```
% JJ_CONFIG=/dev/null hyperfine --sort command --warmup 3 --runs 10 -L bin jj-0,jj-1,jj-2 \
  'target/release-with-debug/{bin} -R ~/mirrors/linux debug snapshot'
Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/linux debug snapshot
  Time (mean ± σ):     225.5 ms ±   4.2 ms    [User: 563.3 ms, System: 475.8 ms]
  Range (min … max):   218.7 ms … 231.4 ms    10 runs

Benchmark 3: target/release-with-debug/jj-2 -R ~/mirrors/linux debug snapshot
  Time (mean ± σ):     189.6 ms ±   3.3 ms    [User: 587.6 ms, System: 475.7 ms]
  Range (min … max):   183.7 ms … 196.5 ms    10 runs

Relative speed comparison
        1.19 ±  0.03  target/release-with-debug/jj-1 -R ~/mirrors/linux debug snapshot
        1.00          target/release-with-debug/jj-2 -R ~/mirrors/linux debug snapshot
```
  • Loading branch information
yuja committed Dec 5, 2024
1 parent ad9a0ef commit ef6d9ef
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 35 deletions.
102 changes: 67 additions & 35 deletions lib/src/local_working_copy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -879,14 +879,14 @@ impl TreeState {

let matcher = IntersectionMatcher::new(sparse_matcher.as_ref(), fsmonitor_matcher);
if matcher.visit(RepoPath::root()).is_nothing() {
// No need to iterate file states to build empty deleted_files.
// No need to load the current tree, set up channels, etc.
self.watchman_clock = watchman_clock;
return Ok(is_dirty);
}

let (tree_entries_tx, tree_entries_rx) = channel();
let (file_states_tx, file_states_rx) = channel();
let (present_files_tx, present_files_rx) = channel();
let (deleted_files_tx, deleted_files_rx) = channel();

trace_span!("traverse filesystem").in_scope(|| -> Result<(), SnapshotError> {
let snapshotter = FileSnapshotter {
Expand All @@ -897,7 +897,7 @@ impl TreeState {
// Move tx sides so they'll be dropped at the end of the scope.
tree_entries_tx,
file_states_tx,
present_files_tx,
deleted_files_tx,
error: OnceLock::new(),
progress,
max_new_file_size,
Expand All @@ -914,34 +914,18 @@ impl TreeState {
})?;

let mut tree_builder = MergedTreeBuilder::new(self.tree_id.clone());
let mut deleted_files: HashSet<_> =
trace_span!("collecting existing files").in_scope(|| {
// Since file_states shouldn't contain files excluded by the sparse patterns,
// fsmonitor_matcher here is identical to the intersected matcher.
let file_states = self.file_states.all();
file_states
.iter()
.filter(|(path, state)| {
fsmonitor_matcher.matches(path) && state.file_type != FileType::GitSubmodule
})
.map(|(path, _state)| path.to_owned())
.collect()
});
trace_span!("process tree entries").in_scope(|| {
for (path, tree_values) in &tree_entries_rx {
tree_builder.set_or_remove(path, tree_values);
}
});
trace_span!("process present files").in_scope(|| {
for path in &present_files_rx {
deleted_files.remove(&path);
}
});
trace_span!("process deleted tree entries").in_scope(|| {
let deleted_files = trace_span!("process deleted tree entries").in_scope(|| {
let deleted_files = HashSet::from_iter(deleted_files_rx);
is_dirty |= !deleted_files.is_empty();
for file in &deleted_files {
tree_builder.set_or_remove(file.clone(), Merge::absent());
}
deleted_files
});
trace_span!("process file states").in_scope(|| {
let changed_file_states = file_states_rx
Expand Down Expand Up @@ -1024,6 +1008,12 @@ struct DirectoryToVisit<'a> {
file_states: FileStates<'a>,
}

#[derive(Clone, Debug, Default)]
struct PresentDirEntries {
dirs: HashSet<String>,
files: HashSet<String>,
}

/// Helper to scan local-disk directories and files in parallel.
struct FileSnapshotter<'a> {
tree_state: &'a TreeState,
Expand All @@ -1032,7 +1022,7 @@ struct FileSnapshotter<'a> {
start_tracking_matcher: &'a dyn Matcher,
tree_entries_tx: Sender<(RepoPathBuf, MergedTreeValue)>,
file_states_tx: Sender<(RepoPathBuf, FileState)>,
present_files_tx: Sender<RepoPathBuf>,
deleted_files_tx: Sender<RepoPathBuf>,
error: OnceLock<SnapshotError>,
progress: Option<&'a SnapshotProgress<'a>>,
max_new_file_size: u64,
Expand Down Expand Up @@ -1076,6 +1066,7 @@ impl FileSnapshotter<'_> {

let git_ignore = git_ignore
.chain_with_file(&dir.to_internal_dir_string(), disk_dir.join(".gitignore"))?;
let mut present_entries = PresentDirEntries::default();
let to_read_dir_err = |err: io::Error| SnapshotError::Other {
message: format!("Failed to read directory {}", disk_dir.display()),
err: err.into(),
Expand All @@ -1085,15 +1076,13 @@ impl FileSnapshotter<'_> {
let file_type = entry.file_type().unwrap();
let file_name = entry.file_name();
let name = file_name
.to_str()
.ok_or_else(|| SnapshotError::InvalidUtf8Path {
path: file_name.clone(),
})?;
.into_string()
.map_err(|path| SnapshotError::InvalidUtf8Path { path })?;

if RESERVED_DIR_NAMES.contains(&name) {
if RESERVED_DIR_NAMES.contains(&name.as_str()) {
continue;
}
let path = dir.join(RepoPathComponent::new(name));
let path = dir.join(RepoPathComponent::new(&name));
let maybe_current_file_state = file_states.get(&path);
if let Some(file_state) = &maybe_current_file_state {
if file_state.file_type == FileType::GitSubmodule {
Expand All @@ -1103,6 +1092,10 @@ impl FileSnapshotter<'_> {

if file_type.is_dir() {
let file_states = file_states.prefixed(&path);
// Mark the directory as present. Whether or not the directory
// path matches, any child file entries shouldn't be touched
// within the current recursion step.
present_entries.dirs.insert(name);
if git_ignore.matches(&path.to_internal_dir_string())
|| self.start_tracking_matcher.visit(&path).is_nothing()
{
Expand Down Expand Up @@ -1152,6 +1145,7 @@ impl FileSnapshotter<'_> {
});
}
if let Some(new_file_state) = file_state(&metadata) {
present_entries.files.insert(name);
self.process_present_file(
path,
&entry.path(),
Expand All @@ -1162,6 +1156,7 @@ impl FileSnapshotter<'_> {
}
}
}
self.emit_deleted_files(&dir, file_states, &present_entries);
Ok(())
}

Expand All @@ -1172,24 +1167,24 @@ impl FileSnapshotter<'_> {
}
let disk_path = tracked_path.to_fs_path(&self.tree_state.working_copy_path)?;
let metadata = match disk_path.symlink_metadata() {
Ok(metadata) => metadata,
Err(err) if err.kind() == io::ErrorKind::NotFound => {
continue;
}
Ok(metadata) => Some(metadata),
Err(err) if err.kind() == io::ErrorKind::NotFound => None,
Err(err) => {
return Err(SnapshotError::Other {
message: format!("Failed to stat file {}", disk_path.display()),
err: err.into(),
});
}
};
if let Some(new_file_state) = file_state(&metadata) {
if let Some(new_file_state) = metadata.as_ref().and_then(file_state) {
self.process_present_file(
tracked_path.to_owned(),
&disk_path,
Some(&current_file_state),
new_file_state,
)?;
} else {
self.deleted_files_tx.send(tracked_path.to_owned()).ok();
}
}
Ok(())
Expand All @@ -1202,7 +1197,6 @@ impl FileSnapshotter<'_> {
maybe_current_file_state: Option<&FileState>,
new_file_state: FileState,
) -> Result<(), SnapshotError> {
self.present_files_tx.send(path.clone()).ok();
let update = self.get_updated_tree_value(
&path,
disk_path,
Expand All @@ -1218,6 +1212,44 @@ impl FileSnapshotter<'_> {
Ok(())
}

/// Emits file paths that don't exist in the `present_entries`.
fn emit_deleted_files(
&self,
dir: &RepoPath,
file_states: FileStates<'_>,
present_entries: &PresentDirEntries,
) {
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Kind {
Dir,
File,
}

let file_state_chunks = file_states.iter().chunk_by(|(path, _state)| {
// Extract <name> from <dir>, <dir>/<name>, or <dir>/<name>/**.
// (file_states may contain <dir> file on file->dir transition.)
debug_assert!(path.starts_with(dir));
let len = dir.as_internal_file_string().len() + (!dir.is_root() as usize);
let tail = path.as_internal_file_string().get(len..).unwrap_or("");
match tail.split_once('/') {
Some((name, _)) => (Kind::Dir, name),
None => (Kind::File, tail),
}
});
file_state_chunks
.into_iter()
.filter(|&((kind, name), _)| match kind {
Kind::Dir => !present_entries.dirs.contains(name),
Kind::File => !present_entries.files.contains(name),
})
.flat_map(|(_, chunk)| chunk)
// Whether or not the entry exists, submodule should be ignored
.filter(|(_, state)| state.file_type != FileType::GitSubmodule)
.filter(|(path, _)| self.matcher.matches(path))
.try_for_each(|(path, _)| self.deleted_files_tx.send(path.to_owned()))
.ok();
}

fn get_updated_tree_value(
&self,
repo_path: &RepoPath,
Expand Down
49 changes: 49 additions & 0 deletions lib/tests/test_local_working_copy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,55 @@ fn test_checkout_discard() {
assert!(!reloaded_wc.file_states().unwrap().contains_path(file2_path));
}

#[test]
fn test_snapshot_file_directory_transition() {
let settings = testutils::user_settings();
let mut test_workspace = TestWorkspace::init(&settings);
let repo = test_workspace.repo.clone();
let workspace_root = test_workspace.workspace.workspace_root().to_owned();
let checkout_options = CheckoutOptions::empty_for_test();
let to_ws_path = |path: &RepoPath| path.to_fs_path(&workspace_root).unwrap();

// file <-> directory transition at root and sub directories
let file1_path = RepoPath::from_internal_string("foo/bar");
let file2_path = RepoPath::from_internal_string("sub/bar/baz");
let file1p_path = file1_path.parent().unwrap();
let file2p_path = file2_path.parent().unwrap();

let tree1 = create_tree(&repo, &[(file1p_path, "1p"), (file2p_path, "2p")]);
let tree2 = create_tree(&repo, &[(file1_path, "1"), (file2_path, "2")]);
let commit1 = commit_with_tree(repo.store(), tree1.id());
let commit2 = commit_with_tree(repo.store(), tree2.id());

let ws = &mut test_workspace.workspace;
ws.check_out(repo.op_id().clone(), None, &commit1, &checkout_options)
.unwrap();

// file -> directory
std::fs::remove_file(to_ws_path(file1p_path)).unwrap();
std::fs::remove_file(to_ws_path(file2p_path)).unwrap();
std::fs::create_dir(to_ws_path(file1p_path)).unwrap();
std::fs::create_dir(to_ws_path(file2p_path)).unwrap();
std::fs::write(to_ws_path(file1_path), "1").unwrap();
std::fs::write(to_ws_path(file2_path), "2").unwrap();
let new_tree = test_workspace.snapshot().unwrap();
assert_eq!(new_tree.id(), tree2.id());

let ws = &mut test_workspace.workspace;
ws.check_out(repo.op_id().clone(), None, &commit2, &checkout_options)
.unwrap();

// directory -> file
std::fs::remove_file(to_ws_path(file1_path)).unwrap();
std::fs::remove_file(to_ws_path(file2_path)).unwrap();
std::fs::remove_dir(to_ws_path(file1p_path)).unwrap();
std::fs::remove_dir(to_ws_path(file2p_path)).unwrap();
std::fs::write(to_ws_path(file1p_path), "1p").unwrap();
std::fs::write(to_ws_path(file2p_path), "2p").unwrap();
let new_tree = test_workspace.snapshot().unwrap();
assert_eq!(new_tree.id(), tree1.id());
}

#[test]
fn test_materialize_snapshot_conflicted_files() {
let settings = testutils::user_settings();
Expand Down

0 comments on commit ef6d9ef

Please sign in to comment.