From 4ac9c7624dacbf76fa3d3fce748f7d26f4894774 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Mon, 8 Jul 2024 20:04:34 +0900 Subject: [PATCH 1/4] diff: move materialized_diff_stream() to jj_lib::conflicts module New diff_contains() revset function will use this helper. --- cli/src/diff_util.rs | 32 ++++---------------------------- lib/src/conflicts.rs | 29 +++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/cli/src/diff_util.rs b/cli/src/diff_util.rs index 49908bc0b1..26a4ea6505 100644 --- a/cli/src/diff_util.rs +++ b/cli/src/diff_util.rs @@ -18,11 +18,11 @@ use std::ops::Range; use std::path::{Path, PathBuf}; use std::{io, mem}; -use futures::{try_join, Stream, StreamExt}; +use futures::StreamExt; use itertools::Itertools; -use jj_lib::backend::{BackendError, BackendResult, TreeValue}; +use jj_lib::backend::{BackendError, TreeValue}; use jj_lib::commit::Commit; -use jj_lib::conflicts::{materialize_tree_value, MaterializedTreeValue}; +use jj_lib::conflicts::{materialized_diff_stream, MaterializedTreeValue}; use jj_lib::diff::{Diff, DiffHunk}; use jj_lib::files::DiffLine; use jj_lib::matchers::Matcher; @@ -30,7 +30,7 @@ use jj_lib::merge::MergedTreeValue; use jj_lib::merged_tree::{MergedTree, TreeDiffStream}; use jj_lib::object_id::ObjectId; use jj_lib::repo::Repo; -use jj_lib::repo_path::{RepoPath, RepoPathBuf, RepoPathUiConverter}; +use jj_lib::repo_path::{RepoPath, RepoPathUiConverter}; use jj_lib::settings::{ConfigResultExt as _, UserSettings}; use jj_lib::store::Store; use jj_lib::{diff, files}; @@ -995,30 +995,6 @@ fn show_unified_diff_hunks( Ok(()) } -fn materialized_diff_stream<'a>( - store: &'a Store, - tree_diff: TreeDiffStream<'a>, -) -> impl Stream< - Item = ( - RepoPathBuf, - BackendResult<(MaterializedTreeValue, MaterializedTreeValue)>, - ), -> + 'a { - tree_diff - .map(|(path, diff)| async { - match diff { - Err(err) => (path, Err(err)), - Ok((before, after)) => { - let before_future = materialize_tree_value(store, &path, before); - let after_future = materialize_tree_value(store, &path, after); - let values = try_join!(before_future, after_future); - (path, values) - } - } - }) - .buffered((store.concurrency() / 2).max(1)) -} - pub fn show_git_diff( formatter: &mut dyn Formatter, store: &Store, diff --git a/lib/src/conflicts.rs b/lib/src/conflicts.rs index f8d586aa7e..954cbe957e 100644 --- a/lib/src/conflicts.rs +++ b/lib/src/conflicts.rs @@ -17,7 +17,7 @@ use std::io::{Read, Write}; use std::iter::zip; -use futures::{StreamExt, TryStreamExt}; +use futures::{try_join, Stream, StreamExt, TryStreamExt}; use itertools::Itertools; use regex::bytes::Regex; @@ -26,7 +26,8 @@ use crate::diff::{Diff, DiffHunk}; use crate::files; use crate::files::{ContentHunk, MergeResult}; use crate::merge::{Merge, MergeBuilder, MergedTreeValue}; -use crate::repo_path::RepoPath; +use crate::merged_tree::TreeDiffStream; +use crate::repo_path::{RepoPath, RepoPathBuf}; use crate::store::Store; const CONFLICT_START_LINE: &[u8] = b"<<<<<<<"; @@ -326,6 +327,30 @@ fn diff_size(hunks: &[DiffHunk]) -> usize { .sum() } +pub fn materialized_diff_stream<'a>( + store: &'a Store, + tree_diff: TreeDiffStream<'a>, +) -> impl Stream< + Item = ( + RepoPathBuf, + BackendResult<(MaterializedTreeValue, MaterializedTreeValue)>, + ), +> + 'a { + tree_diff + .map(|(path, diff)| async { + match diff { + Err(err) => (path, Err(err)), + Ok((before, after)) => { + let before_future = materialize_tree_value(store, &path, before); + let after_future = materialize_tree_value(store, &path, after); + let values = try_join!(before_future, after_future); + (path, values) + } + } + }) + .buffered((store.concurrency() / 2).max(1)) +} + /// Parses conflict markers from a slice. Returns None if there were no valid /// conflict markers. The caller has to provide the expected number of merge /// sides (adds). Conflict markers that are otherwise valid will be considered From bb33d867cc885119cfad17f973359b7b83e7f93b Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Tue, 9 Jul 2024 19:53:19 +0900 Subject: [PATCH 2/4] revset: pass Commit object to inner file() predicate function Commit object extraction is common across predicate functions. --- lib/src/default_index/revset_engine.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/src/default_index/revset_engine.rs b/lib/src/default_index/revset_engine.rs index 76a495222f..c3c4637abd 100644 --- a/lib/src/default_index/revset_engine.rs +++ b/lib/src/default_index/revset_engine.rs @@ -27,7 +27,8 @@ use itertools::Itertools; use super::rev_walk::{EagerRevWalk, PeekableRevWalk, RevWalk, RevWalkBuilder}; use super::revset_graph_iterator::RevsetGraphWalk; use crate::backend::{ChangeId, CommitId, MillisSinceEpoch}; -use crate::default_index::{AsCompositeIndex, CompositeIndex, IndexEntry, IndexPosition}; +use crate::commit::Commit; +use crate::default_index::{AsCompositeIndex, CompositeIndex, IndexPosition}; use crate::graph::GraphEdge; use crate::matchers::{Matcher, Visit}; use crate::repo_path::RepoPath; @@ -1072,7 +1073,8 @@ fn build_predicate_fn( let matcher: Rc = expr.to_matcher().into(); box_pure_predicate_fn(move |index, pos| { let entry = index.entry_by_pos(pos); - has_diff_from_parent(&store, index, &entry, matcher.as_ref()) + let commit = store.get_commit(&entry.commit_id()).unwrap(); + has_diff_from_parent(&store, index, &commit, matcher.as_ref()) }) } RevsetFilterPredicate::HasConflict => box_pure_predicate_fn(move |index, pos| { @@ -1094,10 +1096,9 @@ fn build_predicate_fn( fn has_diff_from_parent( store: &Arc, index: &CompositeIndex, - entry: &IndexEntry<'_>, + commit: &Commit, matcher: &dyn Matcher, ) -> bool { - let commit = store.get_commit(&entry.commit_id()).unwrap(); let parents: Vec<_> = commit.parents().try_collect().unwrap(); if let [parent] = parents.as_slice() { // Fast path: no need to load the root tree From ba1922665582f8ac877146a170f6c89a9c621c52 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Tue, 9 Jul 2024 19:56:27 +0900 Subject: [PATCH 3/4] revset: propagate BackendError from inner file() predicate function We should probably add error propagation path to Revset iterator, and predicate functions will return Result. --- lib/src/default_index/revset_engine.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/src/default_index/revset_engine.rs b/lib/src/default_index/revset_engine.rs index c3c4637abd..eec315e2c1 100644 --- a/lib/src/default_index/revset_engine.rs +++ b/lib/src/default_index/revset_engine.rs @@ -26,7 +26,7 @@ use itertools::Itertools; use super::rev_walk::{EagerRevWalk, PeekableRevWalk, RevWalk, RevWalkBuilder}; use super::revset_graph_iterator::RevsetGraphWalk; -use crate::backend::{ChangeId, CommitId, MillisSinceEpoch}; +use crate::backend::{BackendResult, ChangeId, CommitId, MillisSinceEpoch}; use crate::commit::Commit; use crate::default_index::{AsCompositeIndex, CompositeIndex, IndexPosition}; use crate::graph::GraphEdge; @@ -1034,6 +1034,7 @@ fn build_predicate_fn( store: Arc, predicate: &RevsetFilterPredicate, ) -> Box { + // TODO: propagate BackendError match predicate { RevsetFilterPredicate::ParentCount(parent_count_range) => { let parent_count_range = parent_count_range.clone(); @@ -1074,7 +1075,7 @@ fn build_predicate_fn( box_pure_predicate_fn(move |index, pos| { let entry = index.entry_by_pos(pos); let commit = store.get_commit(&entry.commit_id()).unwrap(); - has_diff_from_parent(&store, index, &commit, matcher.as_ref()) + has_diff_from_parent(&store, index, &commit, matcher.as_ref()).unwrap() }) } RevsetFilterPredicate::HasConflict => box_pure_predicate_fn(move |index, pos| { @@ -1098,20 +1099,20 @@ fn has_diff_from_parent( index: &CompositeIndex, commit: &Commit, matcher: &dyn Matcher, -) -> bool { - let parents: Vec<_> = commit.parents().try_collect().unwrap(); +) -> BackendResult { + let parents: Vec<_> = commit.parents().try_collect()?; if let [parent] = parents.as_slice() { // Fast path: no need to load the root tree let unchanged = commit.tree_id() == parent.tree_id(); if matcher.visit(RepoPath::root()) == Visit::AllRecursively { - return !unchanged; + return Ok(!unchanged); } else if unchanged { - return false; + return Ok(false); } } - let from_tree = rewrite::merge_commit_trees_without_repo(store, &index, &parents).unwrap(); - let to_tree = commit.tree().unwrap(); - from_tree.diff(&to_tree, matcher).next().is_some() + let from_tree = rewrite::merge_commit_trees_without_repo(store, &index, &parents)?; + let to_tree = commit.tree()?; + Ok(from_tree.diff(&to_tree, matcher).next().is_some()) } #[cfg(test)] From 9e7e1d40be72d70fd21ed183c50b56188562591f Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Mon, 8 Jul 2024 20:20:41 +0900 Subject: [PATCH 4/4] revset: add diff_contains(text[, files]) to search diffs The text pattern is applied prior to comparison as we do in Mercurial. This might affect hunk selection, but is much faster than computing diff of full file contents. For example, the following hunk wouldn't be caught by diff_contains("a") because the line "b\n" is filtered out: - a b + a Closes #2933 --- CHANGELOG.md | 2 + docs/revsets.md | 10 ++ lib/src/default_index/revset_engine.rs | 87 ++++++++++++++- lib/src/revset.rs | 25 +++++ lib/tests/test_revset.rs | 146 +++++++++++++++++++++++++ 5 files changed, 268 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d5f79609a..1a7d6bba2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). * The `file()` revset function now accepts fileset as argument. +* New `diff_contains()` revset function can be used to search diffs. + ### Fixed bugs * `jj diff --git` no longer shows the contents of binary files. diff --git a/docs/revsets.md b/docs/revsets.md index 56b7559277..ada858a24d 100644 --- a/docs/revsets.md +++ b/docs/revsets.md @@ -280,6 +280,16 @@ given [string pattern](#string-patterns). Some file patterns might need quoting because the `expression` must also be parsable as a revset. For example, `.` has to be quoted in `file(".")`. +* `diff_contains(text[, files])`: Commits containing diffs matching the given + `text` pattern line by line. + + The search paths can be narrowed by the `files` expression. All modified files + are scanned by default, but it is likely to change in future version to + respect the command line path arguments. + + For example, `diff_contains("TODO", "src")` will search revisions where "TODO" + is added to or removed from files under "src". + * `conflict()`: Commits with conflicts. * `present(x)`: Same as `x`, but evaluated to `none()` if any of the commits diff --git a/lib/src/default_index/revset_engine.rs b/lib/src/default_index/revset_engine.rs index eec315e2c1..c4d68bb67d 100644 --- a/lib/src/default_index/revset_engine.rs +++ b/lib/src/default_index/revset_engine.rs @@ -20,14 +20,17 @@ use std::collections::{BTreeSet, BinaryHeap, HashSet}; use std::ops::Range; use std::rc::Rc; use std::sync::Arc; -use std::{fmt, iter}; +use std::{fmt, iter, str}; +use futures::StreamExt as _; use itertools::Itertools; +use pollster::FutureExt as _; use super::rev_walk::{EagerRevWalk, PeekableRevWalk, RevWalk, RevWalkBuilder}; use super::revset_graph_iterator::RevsetGraphWalk; -use crate::backend::{BackendResult, ChangeId, CommitId, MillisSinceEpoch}; +use crate::backend::{BackendError, BackendResult, ChangeId, CommitId, MillisSinceEpoch}; use crate::commit::Commit; +use crate::conflicts::{materialized_diff_stream, MaterializedTreeValue}; use crate::default_index::{AsCompositeIndex, CompositeIndex, IndexPosition}; use crate::graph::GraphEdge; use crate::matchers::{Matcher, Visit}; @@ -37,6 +40,7 @@ use crate::revset::{ RevsetFilterPredicate, GENERATION_RANGE_FULL, }; use crate::store::Store; +use crate::str_util::StringPattern; use crate::{rewrite, union_find}; type BoxedPredicateFn<'a> = Box bool + 'a>; @@ -1078,6 +1082,16 @@ fn build_predicate_fn( has_diff_from_parent(&store, index, &commit, matcher.as_ref()).unwrap() }) } + RevsetFilterPredicate::DiffContains { text, files } => { + let text_pattern = text.clone(); + let files_matcher: Rc = files.to_matcher().into(); + box_pure_predicate_fn(move |index, pos| { + let entry = index.entry_by_pos(pos); + let commit = store.get_commit(&entry.commit_id()).unwrap(); + matches_diff_from_parent(&store, index, &commit, &text_pattern, &*files_matcher) + .unwrap() + }) + } RevsetFilterPredicate::HasConflict => box_pure_predicate_fn(move |index, pos| { let entry = index.entry_by_pos(pos); let commit = store.get_commit(&entry.commit_id()).unwrap(); @@ -1115,6 +1129,75 @@ fn has_diff_from_parent( Ok(from_tree.diff(&to_tree, matcher).next().is_some()) } +fn matches_diff_from_parent( + store: &Arc, + index: &CompositeIndex, + commit: &Commit, + text_pattern: &StringPattern, + files_matcher: &dyn Matcher, +) -> BackendResult { + let parents: Vec<_> = commit.parents().try_collect()?; + let from_tree = rewrite::merge_commit_trees_without_repo(store, &index, &parents)?; + let to_tree = commit.tree()?; + let tree_diff = from_tree.diff_stream(&to_tree, files_matcher); + // Conflicts are compared in materialized form. Alternatively, conflict + // pairs can be compared one by one. #4062 + let mut diff_stream = materialized_diff_stream(store, tree_diff); + async { + while let Some((path, diff)) = diff_stream.next().await { + let (left_value, right_value) = diff?; + let left_content = to_file_content(&path, left_value)?; + let right_content = to_file_content(&path, right_value)?; + // Filter lines prior to comparison. This might produce inferior + // hunks due to lack of contexts, but is way faster than full diff. + let left_lines = match_lines(&left_content, text_pattern); + let right_lines = match_lines(&right_content, text_pattern); + if left_lines.ne(right_lines) { + return Ok(true); + } + } + Ok(false) + } + .block_on() +} + +fn match_lines<'a: 'b, 'b>( + text: &'a [u8], + pattern: &'b StringPattern, +) -> impl Iterator + 'b { + // The pattern is matched line by line so that it can be anchored to line + // start/end. For example, exact:"" will match blank lines. + text.split_inclusive(|b| *b == b'\n').filter(|line| { + let line = line.strip_suffix(b"\n").unwrap_or(line); + // TODO: add .matches_bytes() or .to_bytes_matcher() + str::from_utf8(line).map_or(false, |line| pattern.matches(line)) + }) +} + +fn to_file_content(path: &RepoPath, value: MaterializedTreeValue) -> BackendResult> { + match value { + MaterializedTreeValue::Absent => Ok(vec![]), + MaterializedTreeValue::AccessDenied(_) => Ok(vec![]), + MaterializedTreeValue::File { id, mut reader, .. } => { + let mut content = vec![]; + reader + .read_to_end(&mut content) + .map_err(|err| BackendError::ReadFile { + path: path.to_owned(), + id: id.clone(), + source: err.into(), + })?; + Ok(content) + } + MaterializedTreeValue::Symlink { id: _, target } => Ok(target.into_bytes()), + MaterializedTreeValue::GitSubmodule(_) => Ok(vec![]), + MaterializedTreeValue::Conflict { contents, .. } => Ok(contents), + MaterializedTreeValue::Tree(id) => { + panic!("Unexpected tree with id {id:?} in diff at path {path:?}"); + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/lib/src/revset.rs b/lib/src/revset.rs index db4ba985f7..afee11cfef 100644 --- a/lib/src/revset.rs +++ b/lib/src/revset.rs @@ -134,6 +134,11 @@ pub enum RevsetFilterPredicate { Committer(StringPattern), /// Commits modifying the paths specified by the fileset. File(FilesetExpression), + /// Commits containing diffs matching the `text` pattern within the `files`. + DiffContains { + text: StringPattern, + files: FilesetExpression, + }, /// Commits with conflicts HasConflict, /// Custom predicates provided by extensions @@ -714,6 +719,26 @@ static BUILTIN_FUNCTION_MAP: Lazy> = Lazy: let expr = FilesetExpression::union_all(file_expressions); Ok(RevsetExpression::filter(RevsetFilterPredicate::File(expr))) }); + map.insert("diff_contains", |function, context| { + let ([text_arg], [files_opt_arg]) = function.expect_arguments()?; + let text = expect_string_pattern(text_arg)?; + let files = if let Some(files_arg) = files_opt_arg { + let ctx = context.workspace.as_ref().ok_or_else(|| { + RevsetParseError::with_span( + RevsetParseErrorKind::FsPathWithoutWorkspace, + files_arg.span, + ) + })?; + expect_fileset_expression(files_arg, ctx.path_converter)? + } else { + // TODO: defaults to CLI path arguments? + // https://github.com/martinvonz/jj/issues/2933#issuecomment-1925870731 + FilesetExpression::all() + }; + Ok(RevsetExpression::filter( + RevsetFilterPredicate::DiffContains { text, files }, + )) + }); map.insert("conflict", |function, _context| { function.expect_no_arguments()?; Ok(RevsetExpression::filter(RevsetFilterPredicate::HasConflict)) diff --git a/lib/tests/test_revset.rs b/lib/tests/test_revset.rs index 13f3d3e210..6add8b4d6c 100644 --- a/lib/tests/test_revset.rs +++ b/lib/tests/test_revset.rs @@ -3012,6 +3012,152 @@ fn test_evaluate_expression_file() { ); } +#[test] +fn test_evaluate_expression_diff_contains() { + let settings = testutils::user_settings(); + let test_workspace = TestWorkspace::init(&settings); + let repo = &test_workspace.repo; + + let mut tx = repo.start_transaction(&settings); + let mut_repo = tx.mut_repo(); + + let empty_clean_inserted_deleted = + RepoPath::from_internal_string("empty_clean_inserted_deleted"); + let blank_clean_inserted_clean = RepoPath::from_internal_string("blank_clean_inserted_clean"); + let noeol_modified_modified_clean = + RepoPath::from_internal_string("noeol_modified_modified_clean"); + let normal_inserted_modified_removed = + RepoPath::from_internal_string("normal_inserted_modified_removed"); + let tree1 = create_tree( + repo, + &[ + (empty_clean_inserted_deleted, ""), + (blank_clean_inserted_clean, "\n"), + (noeol_modified_modified_clean, "1"), + (normal_inserted_modified_removed, "1\n"), + ], + ); + let tree2 = create_tree( + repo, + &[ + (empty_clean_inserted_deleted, ""), + (blank_clean_inserted_clean, "\n"), + (noeol_modified_modified_clean, "2"), + (normal_inserted_modified_removed, "1\n2\n"), + ], + ); + let tree3 = create_tree( + repo, + &[ + (empty_clean_inserted_deleted, "3"), + (blank_clean_inserted_clean, "\n3\n"), + (noeol_modified_modified_clean, "2 3"), + (normal_inserted_modified_removed, "1 3\n2\n"), + ], + ); + let tree4 = create_tree( + repo, + &[ + (empty_clean_inserted_deleted, ""), + (blank_clean_inserted_clean, "\n3\n"), + (noeol_modified_modified_clean, "2 3"), + // normal_inserted_modified_removed + ], + ); + let commit1 = mut_repo + .new_commit( + &settings, + vec![repo.store().root_commit_id().clone()], + tree1.id(), + ) + .write() + .unwrap(); + let commit2 = mut_repo + .new_commit(&settings, vec![commit1.id().clone()], tree2.id()) + .write() + .unwrap(); + let commit3 = mut_repo + .new_commit(&settings, vec![commit2.id().clone()], tree3.id()) + .write() + .unwrap(); + let commit4 = mut_repo + .new_commit(&settings, vec![commit3.id().clone()], tree4.id()) + .write() + .unwrap(); + + let query = |revset_str: &str| { + resolve_commit_ids_in_workspace( + mut_repo, + revset_str, + &test_workspace.workspace, + Some(test_workspace.workspace.workspace_root()), + ) + }; + + // should match both inserted and deleted lines + assert_eq!( + query("diff_contains('2')"), + vec![ + commit4.id().clone(), + commit3.id().clone(), + commit2.id().clone(), + ] + ); + assert_eq!( + query("diff_contains('3')"), + vec![commit4.id().clone(), commit3.id().clone()] + ); + assert_eq!(query("diff_contains('2 3')"), vec![commit3.id().clone()]); + assert_eq!( + query("diff_contains('1 3')"), + vec![commit4.id().clone(), commit3.id().clone()] + ); + + // should match line with eol + assert_eq!( + query(&format!( + "diff_contains(exact:'1', {normal_inserted_modified_removed:?})", + )), + vec![commit3.id().clone(), commit1.id().clone()] + ); + + // should match line without eol + assert_eq!( + query(&format!( + "diff_contains(exact:'1', {noeol_modified_modified_clean:?})", + )), + vec![commit2.id().clone(), commit1.id().clone()] + ); + + // exact:'' should match blank line + assert_eq!( + query(&format!( + "diff_contains(exact:'', {empty_clean_inserted_deleted:?})", + )), + vec![] + ); + assert_eq!( + query(&format!( + "diff_contains(exact:'', {blank_clean_inserted_clean:?})", + )), + vec![commit1.id().clone()] + ); + + // '' should match anything but clean + assert_eq!( + query(&format!( + "diff_contains('', {empty_clean_inserted_deleted:?})", + )), + vec![commit4.id().clone(), commit3.id().clone()] + ); + assert_eq!( + query(&format!( + "diff_contains('', {blank_clean_inserted_clean:?})", + )), + vec![commit3.id().clone(), commit1.id().clone()] + ); +} + #[test] fn test_evaluate_expression_conflict() { let settings = testutils::user_settings();