diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 74be987e3e..54558ed882 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -212,6 +212,10 @@ impl WordComparator { self.compare.eq(left, right) } + fn eq_hashed(&self, left: HashedWord<'_>, right: HashedWord<'_>) -> bool { + left.hash == right.hash && self.compare.eq(left.text, right.text) + } + fn hash_one(&self, text: &[u8]) -> u64 { let mut state = self.hash_builder.build_hasher(); self.compare.hash(text, &mut state); @@ -231,13 +235,24 @@ struct LocalWordPosition(usize); struct DiffSource<'input, 'aux> { text: &'input BStr, ranges: &'aux [Range], + hashes: Vec, } impl<'input, 'aux> DiffSource<'input, 'aux> { - fn new + ?Sized>(text: &'input T, ranges: &'aux [Range]) -> Self { + fn new + ?Sized, C: CompareBytes, S: BuildHasher>( + text: &'input T, + ranges: &'aux [Range], + comp: &WordComparator, + ) -> Self { + let text = BStr::new(text); + let hashes = ranges + .iter() + .map(|range| comp.hash_one(&text[range.clone()])) + .collect(); DiffSource { - text: BStr::new(text), + text, ranges, + hashes, } } @@ -245,6 +260,7 @@ impl<'input, 'aux> DiffSource<'input, 'aux> { LocalDiffSource { text: self.text, ranges: self.ranges, + hashes: &self.hashes, global_offset: WordPosition(0), } } @@ -258,15 +274,17 @@ impl<'input, 'aux> DiffSource<'input, 'aux> { struct LocalDiffSource<'input, 'aux> { text: &'input BStr, ranges: &'aux [Range], + hashes: &'aux [u64], /// The number of preceding word ranges excluded from the self `ranges`. global_offset: WordPosition, } -impl LocalDiffSource<'_, '_> { +impl<'input> LocalDiffSource<'input, '_> { fn narrowed(&self, positions: Range) -> Self { LocalDiffSource { text: self.text, ranges: &self.ranges[positions.start.0..positions.end.0], + hashes: &self.hashes[positions.start.0..positions.end.0], global_offset: self.map_to_global(positions.start), } } @@ -274,6 +292,15 @@ impl LocalDiffSource<'_, '_> { fn map_to_global(&self, position: LocalWordPosition) -> WordPosition { WordPosition(self.global_offset.0 + position.0) } + + fn hashed_words( + &self, + ) -> impl DoubleEndedIterator> + ExactSizeIterator + '_ { + iter::zip(self.ranges, self.hashes).map(|(range, &hash)| { + let text = &self.text[range.clone()]; + HashedWord { hash, text } + }) + } } struct Histogram<'input> { @@ -289,10 +316,7 @@ impl<'input> Histogram<'input> { max_occurrences: usize, ) -> Self { let mut word_to_positions: HashTable = HashTable::new(); - for (i, range) in source.ranges.iter().enumerate() { - let text = &source.text[range.clone()]; - let hash = comp.hash_one(text); - let word = HashedWord { text, hash }; + for (i, word) in source.hashed_words().enumerate() { let (_, positions) = word_to_positions .entry( word.hash, @@ -408,15 +432,15 @@ fn collect_unchanged_words( } // Trim leading common ranges (i.e. grow previous unchanged region) - let common_leading_len = iter::zip(left.ranges, right.ranges) - .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()])) + let common_leading_len = iter::zip(left.hashed_words(), right.hashed_words()) + .take_while(|&(l, r)| comp.eq_hashed(l, r)) .count(); - let left_ranges = &left.ranges[common_leading_len..]; - let right_ranges = &right.ranges[common_leading_len..]; + let left_hashed_words = left.hashed_words().skip(common_leading_len); + let right_hashed_words = right.hashed_words().skip(common_leading_len); // Trim trailing common ranges (i.e. grow next unchanged region) - let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev()) - .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()])) + let common_trailing_len = iter::zip(left_hashed_words.rev(), right_hashed_words.rev()) + .take_while(|&(l, r)| comp.eq_hashed(l, r)) .count(); found_positions.extend(itertools::chain( @@ -623,9 +647,9 @@ impl<'input> Diff<'input> { ) -> Self { assert_eq!(other_inputs.len(), other_token_ranges.len()); let comp = WordComparator::new(compare); - let base_source = DiffSource::new(base_input, base_token_ranges); + let base_source = DiffSource::new(base_input, base_token_ranges, &comp); let other_sources = iter::zip(&other_inputs, other_token_ranges) - .map(|(input, token_ranges)| DiffSource::new(input, token_ranges)) + .map(|(input, token_ranges)| DiffSource::new(input, token_ranges, &comp)) .collect_vec(); let unchanged_regions = match &*other_sources { // Consider the whole range of the base input as unchanged compared @@ -1162,10 +1186,12 @@ mod tests { } fn unchanged_ranges( - left: &DiffSource, - right: &DiffSource, + (left_text, left_ranges): (&[u8], &[Range]), + (right_text, right_ranges): (&[u8], &[Range]), ) -> Vec<(Range, Range)> { let comp = WordComparator::new(CompareBytesExactly); + let left = DiffSource::new(left_text, left_ranges, &comp); + let right = DiffSource::new(right_text, right_ranges, &comp); let mut positions = Vec::new(); collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp); positions @@ -1178,8 +1204,8 @@ mod tests { fn test_unchanged_ranges_insert_in_middle() { assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b b c", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), + (b"a b b c", &[0..1, 2..3, 4..5, 6..7]), + (b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), ), vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)] ); @@ -1191,29 +1217,29 @@ mod tests { // "a"s in the second input. We no longer do. assert_eq!( unchanged_ranges( - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"a b a c", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"b a c a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(6..7, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"b a a c", &[0..1, 2..3, 4..5, 6..7]), ), vec![] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"a b c a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (6..7, 6..7)] ); @@ -1225,29 +1251,29 @@ mod tests { // "a"s in the second input. We no longer do. assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b a c", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"a b a c", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"b a c a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"b a c a", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(6..7, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"b a a c", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"b a a c", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b c a", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a a a a", &[0..1, 2..3, 4..5, 6..7]), + (b"a b c a", &[0..1, 2..3, 4..5, 6..7]), + (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (6..7, 6..7)] ); @@ -1258,45 +1284,45 @@ mod tests { // "|" matches first, then "b" matches within the left/right range. assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b | b", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"b c d |", &[0..1, 2..3, 4..5, 6..7]), + (b"a b | b", &[0..1, 2..3, 4..5, 6..7]), + (b"b c d |", &[0..1, 2..3, 4..5, 6..7]), ), vec![(2..3, 0..1), (4..5, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"| b c d", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"b | a b", &[0..1, 2..3, 4..5, 6..7]), + (b"| b c d", &[0..1, 2..3, 4..5, 6..7]), + (b"b | a b", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 2..3), (2..3, 6..7)] ); // "|" matches first, then the middle range is trimmed. assert_eq!( unchanged_ranges( - &DiffSource::new(b"| b c |", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"| b b |", &[0..1, 2..3, 4..5, 6..7]), + (b"| b c |", &[0..1, 2..3, 4..5, 6..7]), + (b"| b b |", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"| c c |", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"| b c |", &[0..1, 2..3, 4..5, 6..7]), + (b"| c c |", &[0..1, 2..3, 4..5, 6..7]), + (b"| b c |", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)] ); // "|" matches first, then "a", then "b". assert_eq!( unchanged_ranges( - &DiffSource::new(b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]), - &DiffSource::new(b"b a b |", &[0..1, 2..3, 4..5, 6..7]), + (b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]), + (b"b a b |", &[0..1, 2..3, 4..5, 6..7]), ), vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)] ); assert_eq!( unchanged_ranges( - &DiffSource::new(b"| b a b", &[0..1, 2..3, 4..5, 6..7]), - &DiffSource::new(b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), + (b"| b a b", &[0..1, 2..3, 4..5, 6..7]), + (b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), ), vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)] );