From 05c7da3db032d6aaf4805c9681ca4b994717ecd9 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Sun, 22 Sep 2024 20:12:35 +0900 Subject: [PATCH] diff: reuse precomputed hash values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isn't always fast because it increases the chance of cache miss, but in practice, it makes "jj file annotate" faster. It's still slower than "git blame", though. Maybe we should also change the hash function. ``` group new old ----- --- --- bench_diff_git_git_read_tree_c 1.00 45.2±0.38µs 1.29 58.4±0.32µs bench_diff_lines/modified/10k 1.00 32.7±0.24ms 1.05 34.4±0.17ms bench_diff_lines/modified/1k 1.00 2.9±0.00ms 1.06 3.1±0.01ms bench_diff_lines/reversed/10k 1.00 22.7±0.18ms 1.02 23.2±0.29ms bench_diff_lines/reversed/1k 1.00 439.0±9.46µs 1.19 523.9±6.05µs bench_diff_lines/unchanged/10k 1.00 2.9±0.06ms 1.20 3.5±0.02ms bench_diff_lines/unchanged/1k 1.00 240.8±1.03µs 1.30 312.1±1.05µs ``` ``` % hyperfine --sort command --warmup 3 --runs 10 -L bin jj-0,jj-1 \ 'target/release-with-debug/{bin} --ignore-working-copy file annotate lib/src/revset.rs' Benchmark 1: target/release-with-debug/jj-0 .. Time (mean ± σ): 1.604 s ± 0.259 s [User: 1.543 s, System: 0.057 s] Range (min … max): 1.348 s … 1.917 s 10 runs Benchmark 2: target/release-with-debug/jj-1 .. Time (mean ± σ): 1.183 s ± 0.026 s [User: 1.118 s, System: 0.062 s] Range (min … max): 1.155 s … 1.237 s 10 runs ``` --- lib/src/diff.rs | 120 +++++++++++++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 47 deletions(-) diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 74be987e3e..54558ed882 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -212,6 +212,10 @@ impl WordComparator { self.compare.eq(left, right) } + fn eq_hashed(&self, left: HashedWord<'_>, right: HashedWord<'_>) -> bool { + left.hash == right.hash && self.compare.eq(left.text, right.text) + } + fn hash_one(&self, text: &[u8]) -> u64 { let mut state = self.hash_builder.build_hasher(); self.compare.hash(text, &mut state); @@ -231,13 +235,24 @@ struct LocalWordPosition(usize); struct DiffSource<'input, 'aux> { text: &'input BStr, ranges: &'aux [Range], + hashes: Vec, } impl<'input, 'aux> DiffSource<'input, 'aux> { - fn new + ?Sized>(text: &'input T, ranges: &'aux [Range]) -> Self { + fn new + ?Sized, C: CompareBytes, S: BuildHasher>( + text: &'input T, + ranges: &'aux [Range], + comp: &WordComparator, + ) -> Self { + let text = BStr::new(text); + let hashes = ranges + .iter() + .map(|range| comp.hash_one(&text[range.clone()])) + .collect(); DiffSource { - text: BStr::new(text), + text, ranges, + hashes, } } @@ -245,6 +260,7 @@ impl<'input, 'aux> DiffSource<'input, 'aux> { LocalDiffSource { text: self.text, ranges: self.ranges, + hashes: &self.hashes, global_offset: WordPosition(0), } } @@ -258,15 +274,17 @@ impl<'input, 'aux> DiffSource<'input, 'aux> { struct LocalDiffSource<'input, 'aux> { text: &'input BStr, ranges: &'aux [Range], + hashes: &'aux [u64], /// The number of preceding word ranges excluded from the self `ranges`. global_offset: WordPosition, } -impl LocalDiffSource<'_, '_> { +impl<'input> LocalDiffSource<'input, '_> { fn narrowed(&self, positions: Range) -> Self { LocalDiffSource { text: self.text, ranges: &self.ranges[positions.start.0..positions.end.0], + hashes: &self.hashes[positions.start.0..positions.end.0], global_offset: self.map_to_global(positions.start), } } @@ -274,6 +292,15 @@ impl LocalDiffSource<'_, '_> { fn map_to_global(&self, position: LocalWordPosition) -> WordPosition { WordPosition(self.global_offset.0 + position.0) } + + fn hashed_words( + &self, + ) -> impl DoubleEndedIterator> + ExactSizeIterator + '_ { + iter::zip(self.ranges, self.hashes).map(|(range, &hash)| { + let text = &self.text[range.clone()]; + HashedWord { hash, text } + }) + } } struct Histogram<'input> { @@ -289,10 +316,7 @@ impl<'input> Histogram<'input> { max_occurrences: usize, ) -> Self { let mut word_to_positions: HashTable = HashTable::new(); - for (i, range) in source.ranges.iter().enumerate() { - let text = &source.text[range.clone()]; - let hash = comp.hash_one(text); - let word = HashedWord { text, hash }; + for (i, word) in source.hashed_words().enumerate() { let (_, positions) = word_to_positions .entry( word.hash, @@ -408,15 +432,15 @@ fn collect_unchanged_words( } // Trim leading common ranges (i.e. grow previous unchanged region) - let common_leading_len = iter::zip(left.ranges, right.ranges) - .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()])) + let common_leading_len = iter::zip(left.hashed_words(), right.hashed_words()) + .take_while(|&(l, r)| comp.eq_hashed(l, r)) .count(); - let left_ranges = &left.ranges[common_leading_len..]; - let right_ranges = &right.ranges[common_leading_len..]; + let left_hashed_words = left.hashed_words().skip(common_leading_len); + let right_hashed_words = right.hashed_words().skip(common_leading_len); // Trim trailing common ranges (i.e. grow next unchanged region) - let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev()) - .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()])) + let common_trailing_len = iter::zip(left_hashed_words.rev(), right_hashed_words.rev()) + .take_while(|&(l, r)| comp.eq_hashed(l, r)) .count(); found_positions.extend(itertools::chain( @@ -623,9 +647,9 @@ impl<'input> Diff<'input> { ) -> Self { assert_eq!(other_inputs.len(), other_token_ranges.len()); let comp = WordComparator::new(compare); - let base_source = DiffSource::new(base_input, base_token_ranges); + let base_source = DiffSource::new(base_input, base_token_ranges, &comp); let other_sources = iter::zip(&other_inputs, other_token_ranges) - .map(|(input, token_ranges)| DiffSource::new(input, token_ranges)) + .map(|(input, token_ranges)| DiffSource::new(input, token_ranges, &comp)) .collect_vec(); let unchanged_regions = match &*other_sources { // Consider the whole range of the base input as unchanged compared @@ -1162,10 +1186,12 @@ mod tests { } fn unchanged_ranges( - left: &DiffSource, - right: &DiffSource, + (left_text, left_ranges): (&[u8], &[Range]), + (right_text, right_ranges): (&[u8], &[Range]), ) -> Vec<(Range, Range)> { let comp = WordComparator::new(CompareBytesExactly); + let left = DiffSource::new(left_text, left_ranges, &comp); + let right = DiffSource::new(right_text, right_ranges, &comp); let mut positions = Vec::new(); collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp); positions @@ -1178,8 +1204,8 @@ mod tests { fn test_unchanged_ranges_insert_in_middle() { assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b b c", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), + (b"a b b c", &[0..1, 2..3, 4..5, 6..7]), + (b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), ), vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)] ); @@ -1191,29 +1217,29 @@ mod tests { // "a"s in the second input. We no longer do. assert_eq!( unchanged_ranges( - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"a b a c", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"b a c a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(6..7, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"b a a c", &[0..1, 2..3, 4..5, 6..7]), ), vec![] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"a b c a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (6..7, 6..7)] ); @@ -1225,29 +1251,29 @@ mod tests { // "a"s in the second input. We no longer do. assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"a b a c", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"b a c a", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(6..7, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"b a a c", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"a b c a", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (6..7, 6..7)] ); @@ -1258,45 +1284,45 @@ mod tests { // "|" matches first, then "b" matches within the left/right range. assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b | b", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"b c d |", &[0..1, 2..3, 4..5, 6..7]), + (b"a b | b", &[0..1, 2..3, 4..5, 6..7]), + (b"b c d |", &[0..1, 2..3, 4..5, 6..7]), ), vec![(2..3, 0..1), (4..5, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"| b c d", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"b | a b", &[0..1, 2..3, 4..5, 6..7]), + (b"| b c d", &[0..1, 2..3, 4..5, 6..7]), + (b"b | a b", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 2..3), (2..3, 6..7)] ); // "|" matches first, then the middle range is trimmed. assert_eq!( unchanged_ranges( - &DiffSource::new(b"| b c |", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"| b b |", &[0..1, 2..3, 4..5, 6..7]), + (b"| b c |", &[0..1, 2..3, 4..5, 6..7]), + (b"| b b |", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"| c c |", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"| b c |", &[0..1, 2..3, 4..5, 6..7]), + (b"| c c |", &[0..1, 2..3, 4..5, 6..7]), + (b"| b c |", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)] ); // "|" matches first, then "a", then "b". assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]), - &DiffSource::new(b"b a b |", &[0..1, 2..3, 4..5, 6..7]), + (b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]), + (b"b a b |", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"| b a b", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), + (b"| b a b", &[0..1, 2..3, 4..5, 6..7]), + (b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), ), vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)] );