Skip to content

Commit

Permalink
revset: add diff_contains(text[, files]) to search diffs
Browse files Browse the repository at this point in the history
The text pattern is applied prior to comparison as we do in Mercurial. This
might affect hunk selection, but is much faster than computing diff of full
file contents. For example, the following hunk wouldn't be caught by
diff_contains("a") because the line "b\n" is filtered out:

    - a
      b
    + a

Closes #2933
  • Loading branch information
yuja committed Jul 17, 2024
1 parent eabff4c commit 895eead
Show file tree
Hide file tree
Showing 5 changed files with 268 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

* The `file()` revset function now accepts fileset as argument.

* New `diff_contains()` revset function can be used to search diffs.

### Fixed bugs

* `jj diff --git` no longer shows the contents of binary files.
Expand Down
10 changes: 10 additions & 0 deletions docs/revsets.md
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,16 @@ given [string pattern](#string-patterns).
Some file patterns might need quoting because the `expression` must also be
parsable as a revset. For example, `.` has to be quoted in `file(".")`.

* `diff_contains(text[, files])`: Commits containing diffs matching the given
`text` pattern line by line.

The search paths can be narrowed by the `files` expression. All modified files
are scanned by default, but it is likely to change in future version to
respect the command line path arguments.

For example, `diff_contains("TODO", "src")` will search revisions where "TODO"
is added to or removed from files under "src".

* `conflict()`: Commits with conflicts.

* `present(x)`: Same as `x`, but evaluated to `none()` if any of the commits
Expand Down
87 changes: 85 additions & 2 deletions lib/src/default_index/revset_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@ use std::collections::{BTreeSet, BinaryHeap, HashSet};
use std::ops::Range;
use std::rc::Rc;
use std::sync::Arc;
use std::{fmt, iter};
use std::{fmt, iter, str};

use futures::StreamExt as _;
use itertools::Itertools;
use pollster::FutureExt as _;

use super::rev_walk::{EagerRevWalk, PeekableRevWalk, RevWalk, RevWalkBuilder};
use super::revset_graph_iterator::RevsetGraphWalk;
use crate::backend::{BackendResult, ChangeId, CommitId, MillisSinceEpoch};
use crate::backend::{BackendError, BackendResult, ChangeId, CommitId, MillisSinceEpoch};
use crate::commit::Commit;
use crate::conflicts::{materialized_diff_stream, MaterializedTreeValue};
use crate::default_index::{AsCompositeIndex, CompositeIndex, IndexPosition};
use crate::graph::GraphEdge;
use crate::matchers::{Matcher, Visit};
Expand All @@ -37,6 +40,7 @@ use crate::revset::{
RevsetFilterPredicate, GENERATION_RANGE_FULL,
};
use crate::store::Store;
use crate::str_util::StringPattern;
use crate::{rewrite, union_find};

type BoxedPredicateFn<'a> = Box<dyn FnMut(&CompositeIndex, IndexPosition) -> bool + 'a>;
Expand Down Expand Up @@ -1078,6 +1082,16 @@ fn build_predicate_fn(
has_diff_from_parent(&store, index, &commit, matcher.as_ref()).unwrap()
})
}
RevsetFilterPredicate::DiffContains { text, files } => {
let text_pattern = text.clone();
let files_matcher: Rc<dyn Matcher> = files.to_matcher().into();
box_pure_predicate_fn(move |index, pos| {
let entry = index.entry_by_pos(pos);
let commit = store.get_commit(&entry.commit_id()).unwrap();
matches_diff_from_parent(&store, index, &commit, &text_pattern, &*files_matcher)
.unwrap()
})
}
RevsetFilterPredicate::HasConflict => box_pure_predicate_fn(move |index, pos| {
let entry = index.entry_by_pos(pos);
let commit = store.get_commit(&entry.commit_id()).unwrap();
Expand Down Expand Up @@ -1115,6 +1129,75 @@ fn has_diff_from_parent(
Ok(from_tree.diff(&to_tree, matcher).next().is_some())
}

fn matches_diff_from_parent(
store: &Arc<Store>,
index: &CompositeIndex,
commit: &Commit,
text_pattern: &StringPattern,
files_matcher: &dyn Matcher,
) -> BackendResult<bool> {
let parents: Vec<_> = commit.parents().try_collect()?;
let from_tree = rewrite::merge_commit_trees_without_repo(store, &index, &parents)?;
let to_tree = commit.tree()?;
let tree_diff = from_tree.diff_stream(&to_tree, files_matcher);
// Conflicts are compared in materialized form. Alternatively, conflict
// pairs can be compared one by one. #4062
let mut diff_stream = materialized_diff_stream(store, tree_diff);
async {
while let Some((path, diff)) = diff_stream.next().await {
let (left_value, right_value) = diff?;
let left_content = to_file_content(&path, left_value)?;
let right_content = to_file_content(&path, right_value)?;
// Filter lines prior to comparison. This might produce inferior
// hunks due to lack of contexts, but is way faster than full diff.
let left_lines = match_lines(&left_content, text_pattern);
let right_lines = match_lines(&right_content, text_pattern);
if left_lines.ne(right_lines) {
return Ok(true);
}
}
Ok(false)
}
.block_on()
}

fn match_lines<'a: 'b, 'b>(
text: &'a [u8],
pattern: &'b StringPattern,
) -> impl Iterator<Item = &'a [u8]> + 'b {
// The pattern is matched line by line so that it can be anchored to line
// start/end. For example, exact:"" will match blank lines.
text.split_inclusive(|b| *b == b'\n').filter(|line| {
let line = line.strip_suffix(b"\n").unwrap_or(line);
// TODO: add .matches_bytes() or .to_bytes_matcher()
str::from_utf8(line).map_or(false, |line| pattern.matches(line))
})
}

fn to_file_content(path: &RepoPath, value: MaterializedTreeValue) -> BackendResult<Vec<u8>> {
match value {
MaterializedTreeValue::Absent => Ok(vec![]),
MaterializedTreeValue::AccessDenied(_) => Ok(vec![]),
MaterializedTreeValue::File { id, mut reader, .. } => {
let mut content = vec![];
reader
.read_to_end(&mut content)
.map_err(|err| BackendError::ReadFile {
path: path.to_owned(),
id: id.clone(),
source: err.into(),
})?;
Ok(content)
}
MaterializedTreeValue::Symlink { id: _, target } => Ok(target.into_bytes()),
MaterializedTreeValue::GitSubmodule(_) => Ok(vec![]),
MaterializedTreeValue::Conflict { contents, .. } => Ok(contents),
MaterializedTreeValue::Tree(id) => {
panic!("Unexpected tree with id {id:?} in diff at path {path:?}");
}
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
25 changes: 25 additions & 0 deletions lib/src/revset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,11 @@ pub enum RevsetFilterPredicate {
Committer(StringPattern),
/// Commits modifying the paths specified by the fileset.
File(FilesetExpression),
/// Commits containing diffs matching the `text` pattern within the `files`.
DiffContains {
text: StringPattern,
files: FilesetExpression,
},
/// Commits with conflicts
HasConflict,
/// Custom predicates provided by extensions
Expand Down Expand Up @@ -714,6 +719,26 @@ static BUILTIN_FUNCTION_MAP: Lazy<HashMap<&'static str, RevsetFunction>> = Lazy:
let expr = FilesetExpression::union_all(file_expressions);
Ok(RevsetExpression::filter(RevsetFilterPredicate::File(expr)))
});
map.insert("diff_contains", |function, context| {
let ([text_arg], [files_opt_arg]) = function.expect_arguments()?;
let text = expect_string_pattern(text_arg)?;
let files = if let Some(files_arg) = files_opt_arg {
let ctx = context.workspace.as_ref().ok_or_else(|| {
RevsetParseError::with_span(
RevsetParseErrorKind::FsPathWithoutWorkspace,
files_arg.span,
)
})?;
expect_fileset_expression(files_arg, ctx.path_converter)?
} else {
// TODO: defaults to CLI path arguments?
// https://github.com/martinvonz/jj/issues/2933#issuecomment-1925870731
FilesetExpression::all()
};
Ok(RevsetExpression::filter(
RevsetFilterPredicate::DiffContains { text, files },
))
});
map.insert("conflict", |function, _context| {
function.expect_no_arguments()?;
Ok(RevsetExpression::filter(RevsetFilterPredicate::HasConflict))
Expand Down
146 changes: 146 additions & 0 deletions lib/tests/test_revset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3012,6 +3012,152 @@ fn test_evaluate_expression_file() {
);
}

#[test]
fn test_evaluate_expression_diff_contains() {
let settings = testutils::user_settings();
let test_workspace = TestWorkspace::init(&settings);
let repo = &test_workspace.repo;

let mut tx = repo.start_transaction(&settings);
let mut_repo = tx.mut_repo();

let empty_clean_inserted_deleted =
RepoPath::from_internal_string("empty_clean_inserted_deleted");
let blank_clean_inserted_clean = RepoPath::from_internal_string("blank_clean_inserted_clean");
let noeol_modified_modified_clean =
RepoPath::from_internal_string("noeol_modified_modified_clean");
let normal_inserted_modified_removed =
RepoPath::from_internal_string("normal_inserted_modified_removed");
let tree1 = create_tree(
repo,
&[
(empty_clean_inserted_deleted, ""),
(blank_clean_inserted_clean, "\n"),
(noeol_modified_modified_clean, "1"),
(normal_inserted_modified_removed, "1\n"),
],
);
let tree2 = create_tree(
repo,
&[
(empty_clean_inserted_deleted, ""),
(blank_clean_inserted_clean, "\n"),
(noeol_modified_modified_clean, "2"),
(normal_inserted_modified_removed, "1\n2\n"),
],
);
let tree3 = create_tree(
repo,
&[
(empty_clean_inserted_deleted, "3"),
(blank_clean_inserted_clean, "\n3\n"),
(noeol_modified_modified_clean, "2 3"),
(normal_inserted_modified_removed, "1 3\n2\n"),
],
);
let tree4 = create_tree(
repo,
&[
(empty_clean_inserted_deleted, ""),
(blank_clean_inserted_clean, "\n3\n"),
(noeol_modified_modified_clean, "2 3"),
// normal_inserted_modified_removed
],
);
let commit1 = mut_repo
.new_commit(
&settings,
vec![repo.store().root_commit_id().clone()],
tree1.id(),
)
.write()
.unwrap();
let commit2 = mut_repo
.new_commit(&settings, vec![commit1.id().clone()], tree2.id())
.write()
.unwrap();
let commit3 = mut_repo
.new_commit(&settings, vec![commit2.id().clone()], tree3.id())
.write()
.unwrap();
let commit4 = mut_repo
.new_commit(&settings, vec![commit3.id().clone()], tree4.id())
.write()
.unwrap();

let query = |revset_str: &str| {
resolve_commit_ids_in_workspace(
mut_repo,
revset_str,
&test_workspace.workspace,
Some(test_workspace.workspace.workspace_root()),
)
};

// should match both inserted and deleted lines
assert_eq!(
query("diff_contains('2')"),
vec![
commit4.id().clone(),
commit3.id().clone(),
commit2.id().clone(),
]
);
assert_eq!(
query("diff_contains('3')"),
vec![commit4.id().clone(), commit3.id().clone()]
);
assert_eq!(query("diff_contains('2 3')"), vec![commit3.id().clone()]);
assert_eq!(
query("diff_contains('1 3')"),
vec![commit4.id().clone(), commit3.id().clone()]
);

// should match line with eol
assert_eq!(
query(&format!(
"diff_contains(exact:'1', {normal_inserted_modified_removed:?})",
)),
vec![commit3.id().clone(), commit1.id().clone()]
);

// should match line without eol
assert_eq!(
query(&format!(
"diff_contains(exact:'1', {noeol_modified_modified_clean:?})",
)),
vec![commit2.id().clone(), commit1.id().clone()]
);

// exact:'' should match blank line
assert_eq!(
query(&format!(
"diff_contains(exact:'', {empty_clean_inserted_deleted:?})",
)),
vec![]
);
assert_eq!(
query(&format!(
"diff_contains(exact:'', {blank_clean_inserted_clean:?})",
)),
vec![commit1.id().clone()]
);

// '' should match anything but clean
assert_eq!(
query(&format!(
"diff_contains('', {empty_clean_inserted_deleted:?})",
)),
vec![commit4.id().clone(), commit3.id().clone()]
);
assert_eq!(
query(&format!(
"diff_contains('', {blank_clean_inserted_clean:?})",
)),
vec![commit3.id().clone(), commit1.id().clone()]
);
}

#[test]
fn test_evaluate_expression_conflict() {
let settings = testutils::user_settings();
Expand Down

0 comments on commit 895eead

Please sign in to comment.