diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 373a303dd2..d9b7b5aeff 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -16,7 +16,6 @@ use std::cmp::Ordering; use std::collections::BTreeMap; -use std::collections::HashMap; use std::hash::BuildHasher; use std::hash::Hash; use std::hash::Hasher; @@ -26,6 +25,7 @@ use std::ops::Range; use std::slice; use bstr::BStr; +use hashbrown::HashTable; use itertools::Itertools; pub fn find_line_ranges(text: &[u8]) -> Vec> { @@ -178,7 +178,6 @@ struct WordComparator { hash_builder: S, } -#[allow(unused)] // TODO impl WordComparator { fn new(compare: C) -> Self { WordComparator { @@ -189,7 +188,6 @@ impl WordComparator { } } -#[allow(unused)] // TODO impl WordComparator { fn eq(&self, left: &[u8], right: &[u8]) -> bool { self.compare.eq(left, right) @@ -241,15 +239,25 @@ impl<'input, 'aux> DiffSource<'input, 'aux> { } struct Histogram<'input> { - word_to_positions: HashMap<&'input BStr, Vec>, + word_to_positions: HashTable>, } +type HistogramEntry<'input> = (&'input BStr, Vec); + impl<'input> Histogram<'input> { - fn calculate(source: &DiffSource<'input, '_>, max_occurrences: usize) -> Self { - let mut word_to_positions: HashMap<&BStr, Vec> = HashMap::new(); + fn calculate( + source: &DiffSource<'input, '_>, + comp: &WordComparator, + max_occurrences: usize, + ) -> Self { + let mut word_to_positions: HashTable = HashTable::new(); for (i, range) in source.ranges.iter().enumerate() { let word = &source.text[range.clone()]; - let positions = word_to_positions.entry(word).or_default(); + let hash = comp.hash_one(word); + let (_, positions) = word_to_positions + .entry(hash, |(w, _)| comp.eq(w, word), |(w, _)| comp.hash_one(w)) + .or_insert_with(|| (word, vec![])) + .into_mut(); // Allow one more than max_occurrences, so we can later skip those with more // than max_occurrences if positions.len() <= max_occurrences { @@ -259,14 +267,27 @@ impl<'input> Histogram<'input> { Histogram { word_to_positions } } - fn build_count_to_entries(&self) -> BTreeMap)>> { + fn build_count_to_entries(&self) -> BTreeMap>> { let mut count_to_entries: BTreeMap> = BTreeMap::new(); - for (word, positions) in &self.word_to_positions { + for entry in &self.word_to_positions { + let (_, positions) = entry; let entries = count_to_entries.entry(positions.len()).or_default(); - entries.push((*word, positions)); + entries.push(entry); } count_to_entries } + + fn positions_by_word( + &self, + word: &BStr, + comp: &WordComparator, + ) -> Option<&[WordPosition]> { + let hash = comp.hash_one(word); + let (_, positions) = self + .word_to_positions + .find(hash, |(w, _)| comp.eq(w, word))?; + Some(positions) + } } /// Finds the LCS given a array where the value of `input[i]` indicates that @@ -327,10 +348,11 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> { /// Finds unchanged word (or token) positions among the ones given as /// arguments. The data between those words is ignored. -fn collect_unchanged_words( +fn collect_unchanged_words( found_positions: &mut Vec<(WordPosition, WordPosition)>, left: &DiffSource, right: &DiffSource, + comp: &WordComparator, ) { if left.ranges.is_empty() || right.ranges.is_empty() { return; @@ -338,21 +360,21 @@ fn collect_unchanged_words( // Prioritize LCS-based algorithm than leading/trailing matches let old_len = found_positions.len(); - collect_unchanged_words_lcs(found_positions, left, right); + collect_unchanged_words_lcs(found_positions, left, right, comp); if found_positions.len() != old_len { return; } // Trim leading common ranges (i.e. grow previous unchanged region) let common_leading_len = iter::zip(left.ranges, right.ranges) - .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()]) + .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()])) .count(); let left_ranges = &left.ranges[common_leading_len..]; let right_ranges = &right.ranges[common_leading_len..]; // Trim trailing common ranges (i.e. grow next unchanged region) let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev()) - .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()]) + .take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()])) .count(); found_positions.extend(itertools::chain( @@ -371,19 +393,20 @@ fn collect_unchanged_words( )); } -fn collect_unchanged_words_lcs( +fn collect_unchanged_words_lcs( found_positions: &mut Vec<(WordPosition, WordPosition)>, left: &DiffSource, right: &DiffSource, + comp: &WordComparator, ) { let max_occurrences = 100; - let left_histogram = Histogram::calculate(left, max_occurrences); + let left_histogram = Histogram::calculate(left, comp, max_occurrences); let left_count_to_entries = left_histogram.build_count_to_entries(); if *left_count_to_entries.keys().next().unwrap() > max_occurrences { // If there are very many occurrences of all words, then we just give up. return; } - let right_histogram = Histogram::calculate(right, max_occurrences); + let right_histogram = Histogram::calculate(right, comp, max_occurrences); // Look for words with few occurrences in `left` (could equally well have picked // `right`?). If any of them also occur in `right`, then we add the words to // the LCS. @@ -392,7 +415,7 @@ fn collect_unchanged_words_lcs( let mut both_positions = left_entries .iter() .filter_map(|&(word, left_positions)| { - let right_positions = right_histogram.word_to_positions.get(word)?; + let right_positions = right_histogram.positions_by_word(word, comp)?; (left_positions.len() == right_positions.len()) .then_some((left_positions, right_positions)) }) @@ -436,6 +459,7 @@ fn collect_unchanged_words_lcs( found_positions, &left.narrowed(previous_left_position..left_position), &right.narrowed(previous_right_position..right_position), + comp, ); found_positions.push(( left.map_to_global(left_position), @@ -449,6 +473,7 @@ fn collect_unchanged_words_lcs( found_positions, &left.narrowed(previous_left_position..WordPosition(left.ranges.len())), &right.narrowed(previous_right_position..WordPosition(right.ranges.len())), + comp, ); } @@ -522,6 +547,7 @@ impl<'input> Diff<'input> { pub fn for_tokenizer + ?Sized + 'input>( inputs: impl IntoIterator, tokenizer: impl Fn(&[u8]) -> Vec>, + compare: impl CompareBytes, ) -> Self { let mut inputs = inputs.into_iter().map(BStr::new); let base_input = inputs.next().expect("inputs must not be empty"); @@ -530,7 +556,9 @@ impl<'input> Diff<'input> { let base_token_ranges: Vec>; let other_token_ranges: Vec>>; // No need to tokenize if one of the inputs is empty. Non-empty inputs - // are all different. + // are all different as long as the tokenizer emits non-empty ranges. + // This means "" and " " are different even if the compare function is + // ignore-whitespace. They are tokenized as [] and [" "] respectively. if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) { base_token_ranges = vec![]; other_token_ranges = iter::repeat(vec![]).take(other_inputs.len()).collect(); @@ -546,6 +574,7 @@ impl<'input> Diff<'input> { other_inputs, &base_token_ranges, &other_token_ranges, + compare, ) } @@ -554,8 +583,10 @@ impl<'input> Diff<'input> { other_inputs: Vec<&'input BStr>, base_token_ranges: &[Range], other_token_ranges: &[Vec>], + compare: impl CompareBytes, ) -> Self { assert_eq!(other_inputs.len(), other_token_ranges.len()); + let comp = WordComparator::new(compare); let base_source = DiffSource::new(base_input, base_token_ranges); let other_sources = iter::zip(&other_inputs, other_token_ranges) .map(|(input, token_ranges)| DiffSource::new(input, token_ranges)) @@ -574,7 +605,12 @@ impl<'input> Diff<'input> { // found ranges with the ranges in the diff. [first_other_source, tail_other_sources @ ..] => { let mut first_positions = Vec::new(); - collect_unchanged_words(&mut first_positions, &base_source, first_other_source); + collect_unchanged_words( + &mut first_positions, + &base_source, + first_other_source, + &comp, + ); if tail_other_sources.is_empty() { first_positions .iter() @@ -596,7 +632,12 @@ impl<'input> Diff<'input> { first_positions, |current_positions, other_source| { let mut new_positions = Vec::new(); - collect_unchanged_words(&mut new_positions, &base_source, other_source); + collect_unchanged_words( + &mut new_positions, + &base_source, + other_source, + &comp, + ); intersect_unchanged_words(current_positions, &new_positions) }, ); @@ -635,14 +676,14 @@ impl<'input> Diff<'input> { pub fn unrefined + ?Sized + 'input>( inputs: impl IntoIterator, ) -> Self { - Diff::for_tokenizer(inputs, |_| vec![]) + Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly) } /// Compares `inputs` line by line. pub fn by_line + ?Sized + 'input>( inputs: impl IntoIterator, ) -> Self { - Diff::for_tokenizer(inputs, find_line_ranges) + Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly) } /// Compares `inputs` word by word. @@ -652,8 +693,8 @@ impl<'input> Diff<'input> { pub fn by_word + ?Sized + 'input>( inputs: impl IntoIterator, ) -> Self { - let mut diff = Diff::for_tokenizer(inputs, find_word_ranges); - diff.refine_changed_regions(find_nonword_ranges); + let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly); + diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly); diff } @@ -684,7 +725,11 @@ impl<'input> Diff<'input> { /// Uses the given tokenizer to split the changed regions into smaller /// regions. Then tries to finds unchanged regions among them. - pub fn refine_changed_regions(&mut self, tokenizer: impl Fn(&[u8]) -> Vec>) { + pub fn refine_changed_regions( + &mut self, + tokenizer: impl Fn(&[u8]) -> Vec>, + compare: impl CompareBytes, + ) { let mut previous = UnchangedRange { base: 0..0, others: vec![0..0; self.other_inputs.len()], @@ -696,7 +741,7 @@ impl<'input> Diff<'input> { // offsets to be valid in the context of the larger Diff instance // (`self`). let refined_diff = - Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer); + Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer, &compare); for refined in &refined_diff.unchanged_regions { let new_base_start = refined.base.start + previous.base.end; let new_base_end = refined.base.end + previous.base.end; @@ -811,9 +856,9 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> { pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>( inputs: impl IntoIterator, ) -> Vec> { - let mut diff = Diff::for_tokenizer(inputs, find_line_ranges); - diff.refine_changed_regions(find_word_ranges); - diff.refine_changed_regions(find_nonword_ranges); + let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly); + diff.refine_changed_regions(find_word_ranges, CompareBytesExactly); + diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly); diff.hunks().collect() } @@ -991,8 +1036,9 @@ mod tests { left: &DiffSource, right: &DiffSource, ) -> Vec<(Range, Range)> { + let comp = WordComparator::new(CompareBytesExactly); let mut positions = Vec::new(); - collect_unchanged_words(&mut positions, left, right); + collect_unchanged_words(&mut positions, left, right, &comp); positions .into_iter() .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos))) @@ -1207,6 +1253,7 @@ mod tests { let diff = Diff::for_tokenizer( ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"], find_line_ranges, + CompareBytesExactly, ); assert_eq!( diff.hunks().collect_vec(), @@ -1277,6 +1324,60 @@ mod tests { ); } + #[test] + fn test_diff_ignore_all_whitespace() { + fn diff(inputs: [&str; 2]) -> Vec> { + let diff = + Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace); + diff.hunks().collect() + } + + assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]); + assert_eq!(diff(["a\n", " a\r\n"]), vec![DiffHunk::matching("a\n")]); + assert_eq!( + diff(["a\n", " a\nb"]), + vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])] + ); + + // No LCS matches, so trim leading/trailing common lines + assert_eq!( + diff(["a\nc\n", " a\n a\n"]), + vec![ + DiffHunk::matching("a\n"), + DiffHunk::different(["c\n", " a\n"]), + ] + ); + assert_eq!( + diff(["c\na\n", " a\n a\n"]), + vec![ + DiffHunk::different(["c\n", " a\n"]), + DiffHunk::matching("a\n"), + ] + ); + } + + #[test] + fn test_diff_ignore_whitespace_amount() { + fn diff(inputs: [&str; 2]) -> Vec> { + let diff = + Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount); + diff.hunks().collect() + } + + assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]); + // whitespace at line end is ignored + assert_eq!(diff(["a\n", "a\r\n"]), vec![DiffHunk::matching("a\n")]); + // but whitespace at line start isn't + assert_eq!( + diff(["a\n", " a\n"]), + vec![DiffHunk::different(["a\n", " a\n"])] + ); + assert_eq!( + diff(["a\n", "a \nb"]), + vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])] + ); + } + #[test] fn test_diff_real_case_write_fmt() { // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the