Skip to content

Commit

Permalink
diff: reuse precomputed hash values
Browse files Browse the repository at this point in the history
This isn't always fast because it increases the chance of cache miss, but in
practice, it makes "jj file annotate" faster. It's still slower than
"git blame", though.

Maybe we should also change the hash function.

```
group                             new                     old
-----                             ---                     ---
bench_diff_git_git_read_tree_c    1.00     45.2±0.38µs    1.29     58.4±0.32µs
bench_diff_lines/modified/10k     1.00     32.7±0.24ms    1.05     34.4±0.17ms
bench_diff_lines/modified/1k      1.00      2.9±0.00ms    1.06      3.1±0.01ms
bench_diff_lines/reversed/10k     1.00     22.7±0.18ms    1.02     23.2±0.29ms
bench_diff_lines/reversed/1k      1.00    439.0±9.46µs    1.19    523.9±6.05µs
bench_diff_lines/unchanged/10k    1.00      2.9±0.06ms    1.20      3.5±0.02ms
bench_diff_lines/unchanged/1k     1.00    240.8±1.03µs    1.30    312.1±1.05µs
```

```
% hyperfine --sort command --warmup 3 --runs 10 -L bin jj-0,jj-1 \
  'target/release-with-debug/{bin} --ignore-working-copy file annotate lib/src/revset.rs'
Benchmark 1: target/release-with-debug/jj-0 ..
  Time (mean ± σ):      1.604 s ±  0.259 s    [User: 1.543 s, System: 0.057 s]
  Range (min … max):    1.348 s …  1.917 s    10 runs

Benchmark 2: target/release-with-debug/jj-1 ..
  Time (mean ± σ):      1.183 s ±  0.026 s    [User: 1.118 s, System: 0.062 s]
  Range (min … max):    1.155 s …  1.237 s    10 runs
```
  • Loading branch information
yuja committed Nov 20, 2024
1 parent 2c65356 commit 05c7da3
Showing 1 changed file with 73 additions and 47 deletions.
120 changes: 73 additions & 47 deletions lib/src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,10 @@ impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
self.compare.eq(left, right)
}

fn eq_hashed(&self, left: HashedWord<'_>, right: HashedWord<'_>) -> bool {
left.hash == right.hash && self.compare.eq(left.text, right.text)
}

fn hash_one(&self, text: &[u8]) -> u64 {
let mut state = self.hash_builder.build_hasher();
self.compare.hash(text, &mut state);
Expand All @@ -231,20 +235,32 @@ struct LocalWordPosition(usize);
struct DiffSource<'input, 'aux> {
text: &'input BStr,
ranges: &'aux [Range<usize>],
hashes: Vec<u64>,
}

impl<'input, 'aux> DiffSource<'input, 'aux> {
fn new<T: AsRef<[u8]> + ?Sized>(text: &'input T, ranges: &'aux [Range<usize>]) -> Self {
fn new<T: AsRef<[u8]> + ?Sized, C: CompareBytes, S: BuildHasher>(
text: &'input T,
ranges: &'aux [Range<usize>],
comp: &WordComparator<C, S>,
) -> Self {
let text = BStr::new(text);
let hashes = ranges
.iter()
.map(|range| comp.hash_one(&text[range.clone()]))
.collect();
DiffSource {
text: BStr::new(text),
text,
ranges,
hashes,
}
}

fn local(&self) -> LocalDiffSource<'input, '_> {
LocalDiffSource {
text: self.text,
ranges: self.ranges,
hashes: &self.hashes,
global_offset: WordPosition(0),
}
}
Expand All @@ -258,22 +274,33 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
struct LocalDiffSource<'input, 'aux> {
text: &'input BStr,
ranges: &'aux [Range<usize>],
hashes: &'aux [u64],
/// The number of preceding word ranges excluded from the self `ranges`.
global_offset: WordPosition,
}

impl LocalDiffSource<'_, '_> {
impl<'input> LocalDiffSource<'input, '_> {
fn narrowed(&self, positions: Range<LocalWordPosition>) -> Self {
LocalDiffSource {
text: self.text,
ranges: &self.ranges[positions.start.0..positions.end.0],
hashes: &self.hashes[positions.start.0..positions.end.0],
global_offset: self.map_to_global(positions.start),
}
}

fn map_to_global(&self, position: LocalWordPosition) -> WordPosition {
WordPosition(self.global_offset.0 + position.0)
}

fn hashed_words(
&self,
) -> impl DoubleEndedIterator<Item = HashedWord<'input>> + ExactSizeIterator + '_ {
iter::zip(self.ranges, self.hashes).map(|(range, &hash)| {
let text = &self.text[range.clone()];
HashedWord { hash, text }
})
}
}

struct Histogram<'input> {
Expand All @@ -289,10 +316,7 @@ impl<'input> Histogram<'input> {
max_occurrences: usize,
) -> Self {
let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
for (i, range) in source.ranges.iter().enumerate() {
let text = &source.text[range.clone()];
let hash = comp.hash_one(text);
let word = HashedWord { text, hash };
for (i, word) in source.hashed_words().enumerate() {
let (_, positions) = word_to_positions
.entry(
word.hash,
Expand Down Expand Up @@ -408,15 +432,15 @@ fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
}

// Trim leading common ranges (i.e. grow previous unchanged region)
let common_leading_len = iter::zip(left.ranges, right.ranges)
.take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
let common_leading_len = iter::zip(left.hashed_words(), right.hashed_words())
.take_while(|&(l, r)| comp.eq_hashed(l, r))
.count();
let left_ranges = &left.ranges[common_leading_len..];
let right_ranges = &right.ranges[common_leading_len..];
let left_hashed_words = left.hashed_words().skip(common_leading_len);
let right_hashed_words = right.hashed_words().skip(common_leading_len);

// Trim trailing common ranges (i.e. grow next unchanged region)
let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev())
.take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
let common_trailing_len = iter::zip(left_hashed_words.rev(), right_hashed_words.rev())
.take_while(|&(l, r)| comp.eq_hashed(l, r))
.count();

found_positions.extend(itertools::chain(
Expand Down Expand Up @@ -623,9 +647,9 @@ impl<'input> Diff<'input> {
) -> Self {
assert_eq!(other_inputs.len(), other_token_ranges.len());
let comp = WordComparator::new(compare);
let base_source = DiffSource::new(base_input, base_token_ranges);
let base_source = DiffSource::new(base_input, base_token_ranges, &comp);
let other_sources = iter::zip(&other_inputs, other_token_ranges)
.map(|(input, token_ranges)| DiffSource::new(input, token_ranges))
.map(|(input, token_ranges)| DiffSource::new(input, token_ranges, &comp))
.collect_vec();
let unchanged_regions = match &*other_sources {
// Consider the whole range of the base input as unchanged compared
Expand Down Expand Up @@ -1162,10 +1186,12 @@ mod tests {
}

fn unchanged_ranges(
left: &DiffSource,
right: &DiffSource,
(left_text, left_ranges): (&[u8], &[Range<usize>]),
(right_text, right_ranges): (&[u8], &[Range<usize>]),
) -> Vec<(Range<usize>, Range<usize>)> {
let comp = WordComparator::new(CompareBytesExactly);
let left = DiffSource::new(left_text, left_ranges, &comp);
let right = DiffSource::new(right_text, right_ranges, &comp);
let mut positions = Vec::new();
collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp);
positions
Expand All @@ -1178,8 +1204,8 @@ mod tests {
fn test_unchanged_ranges_insert_in_middle() {
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a b b c", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
(b"a b b c", &[0..1, 2..3, 4..5, 6..7]),
(b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
),
vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
);
Expand All @@ -1191,29 +1217,29 @@ mod tests {
// "a"s in the second input. We no longer do.
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
(b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(0..1, 0..1)]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
(b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(6..7, 6..7)]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
(b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
),
vec![]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
(b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(0..1, 0..1), (6..7, 6..7)]
);
Expand All @@ -1225,29 +1251,29 @@ mod tests {
// "a"s in the second input. We no longer do.
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
(b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(0..1, 0..1)]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
(b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(6..7, 6..7)]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
(b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
),
vec![]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
(b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
(b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(0..1, 0..1), (6..7, 6..7)]
);
Expand All @@ -1258,45 +1284,45 @@ mod tests {
// "|" matches first, then "b" matches within the left/right range.
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a b | b", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"b c d |", &[0..1, 2..3, 4..5, 6..7]),
(b"a b | b", &[0..1, 2..3, 4..5, 6..7]),
(b"b c d |", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(2..3, 0..1), (4..5, 6..7)]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"| b c d", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"b | a b", &[0..1, 2..3, 4..5, 6..7]),
(b"| b c d", &[0..1, 2..3, 4..5, 6..7]),
(b"b | a b", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(0..1, 2..3), (2..3, 6..7)]
);
// "|" matches first, then the middle range is trimmed.
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"| b b |", &[0..1, 2..3, 4..5, 6..7]),
(b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
(b"| b b |", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"| c c |", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
(b"| c c |", &[0..1, 2..3, 4..5, 6..7]),
(b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)]
);
// "|" matches first, then "a", then "b".
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]),
&DiffSource::new(b"b a b |", &[0..1, 2..3, 4..5, 6..7]),
(b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]),
(b"b a b |", &[0..1, 2..3, 4..5, 6..7]),
),
vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)]
);
assert_eq!(
unchanged_ranges(
&DiffSource::new(b"| b a b", &[0..1, 2..3, 4..5, 6..7]),
&DiffSource::new(b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
(b"| b a b", &[0..1, 2..3, 4..5, 6..7]),
(b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
),
vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)]
);
Expand Down

0 comments on commit 05c7da3

Please sign in to comment.