Skip to content

Commit

Permalink
diff: consider uncommon words to match only if they have the same count
Browse files Browse the repository at this point in the history
Patience diff starts by lining up unique elements (e.g. lines) to find
matching segments of the inputs. After that, it refines the
non-matching segments by repeating the process. Histogram expands on
that by not just considering unique elements but by continuing with
elements of count 2, then 3, etc.

Before this commit, when diffing "a b a b b" against "a b a b a b", we
would match the two "a"s in the first input against the first two "a"s
in the second input. After this patch, we ignore the "a"s because
their counts differ, so we try to align the "b"s instead.

I have had this commit lying around since I wrote the histogram diff
implementation 18 months ago. I vaguely remember thinking that the way
I had implemented it (without this commit) was a bit weird, but I
wasn't sure if this commit would be an improvement or not. The bug
report from @chooglen today of a case where we behave differently from
Git is enough to make me think that we make this change after all.

Many unit tests of the diff algorithm are affected, mostly because we
no longer match the leading space in " a " against the space in " b"
and similar.

Closes #761.
  • Loading branch information
martinvonz committed Nov 18, 2022
1 parent f9f4f8b commit 6cca609
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 50 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

* `jj git import` no longer crashes when all Git refs are removed.

* (#761) Fixed a bug in the histogram diff algorithm.

## [0.5.1] - 2022-10-17

No changes (just trying to get automated GitHub release to work).
Expand Down
84 changes: 35 additions & 49 deletions lib/src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,26 +188,20 @@ pub(crate) fn unchanged_ranges(
// the LCS.
let mut uncommon_shared_words = vec![];
while !left_histogram.count_to_words.is_empty() && uncommon_shared_words.is_empty() {
let left_count = *left_histogram.count_to_words.first_key().unwrap();
let left_words = left_histogram.count_to_words.pop_first_value().unwrap();
for left_word in left_words {
if right_histogram.word_to_positions.contains_key(left_word) {
uncommon_shared_words.push(left_word);
if let Some(right_positions) = right_histogram.word_to_positions.get(left_word) {
if right_positions.len() == left_count {
uncommon_shared_words.push(left_word);
}
}
}
}
if uncommon_shared_words.is_empty() {
return vec![];
}

// Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
// the least common words to be "a" and "b". We now assume that each
// occurrence of each word lines up in the left and right input. We do that
// by numbering the shared occurrences, effectively instead comparing "a1 b1
// a2 b2" and "a1 b1 c c b2 a2 b". We then walk the common words in the
// right input in order (["a1", "b1", "b2", "a2"]), and record the index of
// that word in the left input ([0,1,3,2]). We then find the LCS and split
// points based on that ([0,1,3] or [0,1,2] are both valid).

// [(index into left_ranges, word, occurrence #)]
let mut left_positions = vec![];
let mut right_positions = vec![];
Expand All @@ -220,8 +214,8 @@ pub(crate) fn unchanged_ranges(
.word_to_positions
.get_mut(uncommon_shared_word)
.unwrap();
let shared_count = min(left_occurrences.len(), right_occurrences.len());
for occurrence in 0..shared_count {
assert_eq!(left_occurrences.len(), right_occurrences.len());
for occurrence in 0..left_occurrences.len() {
left_positions.push((
left_occurrences[occurrence],
uncommon_shared_word,
Expand Down Expand Up @@ -751,27 +745,29 @@ mod tests {

#[test]
fn test_unchanged_ranges_non_unique_removed() {
// We used to consider the first two "a" in the first input to match the two "a"s in the second input. We no longer do.
assert_eq!(
unchanged_ranges(
b"a a a a",
b"a b a c",
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(0..1, 0..1), (2..3, 4..5)]
vec![]
);
}

#[test]
fn test_unchanged_ranges_non_unique_added() {
// We used to consider the first two "a" in the first input to match the two "a"s in the second input. We no longer do.
assert_eq!(
unchanged_ranges(
b"a b a c",
b"a a a a",
&[0..1, 2..3, 4..5, 6..7],
&[0..1, 2..3, 4..5, 6..7],
),
vec![(0..1, 0..1), (4..5, 2..3)]
vec![]
);
}

Expand Down Expand Up @@ -898,8 +894,8 @@ mod tests {
assert_eq!(
diff.hunks().collect_vec(),
vec![
DiffHunk::Matching(b"a "),
DiffHunk::Different(vec![b"b ", b"X ", b""]),
DiffHunk::Matching(b"a"),
DiffHunk::Different(vec![b" b ", b" X ", b" "]),
DiffHunk::Matching(b"c"),
DiffHunk::Different(vec![b"", b"", b" X"]),
]
Expand Down Expand Up @@ -936,8 +932,8 @@ mod tests {
assert_eq!(
diff(b"a z", b"a S z"),
vec![
DiffHunk::Matching(b"a "),
DiffHunk::Different(vec![b"", b"S "]),
DiffHunk::Matching(b"a"),
DiffHunk::Different(vec![b" ", b" S "]),
DiffHunk::Matching(b"z"),
]
);
Expand All @@ -948,10 +944,10 @@ mod tests {
assert_eq!(
diff(b"a R R S S z", b"a S S R R z"),
vec![
DiffHunk::Matching(b"a "),
DiffHunk::Different(vec![b"R R ", b""]),
DiffHunk::Matching(b"S S "),
DiffHunk::Different(vec![b"", b"R R "]),
DiffHunk::Matching(b"a"),
DiffHunk::Different(vec![b" R R ", b" "]),
DiffHunk::Matching(b"S S"),
DiffHunk::Different(vec![b" ", b" R R "]),
DiffHunk::Matching(b"z")
],
);
Expand All @@ -965,18 +961,14 @@ mod tests {
b"a r r x q y z q b y q x r r c",
),
vec![
DiffHunk::Matching(b"a "),
DiffHunk::Different(vec![b"q", b"r"]),
DiffHunk::Matching(b" "),
DiffHunk::Different(vec![b"", b"r "]),
DiffHunk::Matching(b"x q y "),
DiffHunk::Different(vec![b"q ", b""]),
DiffHunk::Matching(b"z q b "),
DiffHunk::Different(vec![b"q ", b""]),
DiffHunk::Matching(b"y q x "),
DiffHunk::Different(vec![b"q", b"r"]),
DiffHunk::Matching(b" "),
DiffHunk::Different(vec![b"", b"r "]),
DiffHunk::Matching(b"a"),
DiffHunk::Different(vec![b" q ", b" r r "]),
DiffHunk::Matching(b"x q y"),
DiffHunk::Different(vec![b" q ", b" "]),
DiffHunk::Matching(b"z q b"),
DiffHunk::Different(vec![b" q ", b" "]),
DiffHunk::Matching(b"y q x"),
DiffHunk::Different(vec![b" q ", b" r r "]),
DiffHunk::Matching(b"c"),
]
);
Expand All @@ -994,10 +986,10 @@ mod tests {
b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n self.styler().write_fmt(fmt)\n"
),
vec![
DiffHunk::Matching(b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "),
DiffHunk::Different(vec![b"", b"-> io::Result<()> "]),
DiffHunk::Matching(b"{\n self.styler().write_fmt(fmt)"),
DiffHunk::Different(vec![b".unwrap()", b""]),
DiffHunk::Matching(b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_"),
DiffHunk::Different(vec![b">) ", b">) -> io::Result<()> "]),
DiffHunk::Matching(b"{\n self.styler().write_fmt(fmt"),
DiffHunk::Different(vec![b").unwrap()", b")"]),
DiffHunk::Matching(b"\n")
]
);
Expand Down Expand Up @@ -1148,22 +1140,16 @@ int main(int argc, char **argv)
"##,
),
vec![
DiffHunk::Matching(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"),
DiffHunk::Different(vec![b"", b"static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
DiffHunk::Matching(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n"),
DiffHunk::Different(vec![b"\n", b"\nstatic void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
DiffHunk::Matching(b"static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"),
DiffHunk::Different(vec![b"", b"\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
DiffHunk::Matching(b"\t\tunsigned int mode;\n"),
DiffHunk::Different(vec![b"", b"\t\tint fd;\n\n"]),
DiffHunk::Matching(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"),
DiffHunk::Different(vec![b"printf", b"data = read_sha1_file"]),
DiffHunk::Matching(b"("),
DiffHunk::Different(vec![b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""]),
DiffHunk::Different(vec![b"printf(\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b"data = read_sha1_file("]),
DiffHunk::Matching(b"sha1"),
DiffHunk::Different(vec![b"", b", type, &filesize"]),
DiffHunk::Matching(b")"),
DiffHunk::Different(vec![b")", b""]),
DiffHunk::Matching(b";\n"),
DiffHunk::Different(vec![b"", b"\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"]),
DiffHunk::Different(vec![b"));\n", b", type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"]),
DiffHunk::Matching(b"\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n")
]
);
Expand Down
3 changes: 2 additions & 1 deletion tests/test_obslog_command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,11 @@ fn test_obslog_with_or_without_diff() {
@ [email protected] 2001-02-03 04:05:08.000 +07:00 1daafc17fefb
| my description
| Resolved conflict in file1:
| 1 1: <<<<<<<resolved
| 1 : <<<<<<<
| 2 : %%%%%%%
| 3 : +bar
| 4 : >>>>>>>
| 1: resolved
o [email protected] 2001-02-03 04:05:08.000 +07:00 813918f7b4e6 conflict
| my description
o [email protected] 2001-02-03 04:05:08.000 +07:00 8f02f5470c55
Expand Down

0 comments on commit 6cca609

Please sign in to comment.