Skip to content

Commit

Permalink
diff: use low-level HashTable in Histogram
Browse files Browse the repository at this point in the history
This change made some diff benches slow, maybe because the generated code
becomes slightly worse due to the added abstraction? I'll revisit the
performance problem later. There are a couple of ways to mitigate it.

```
group                             new                     old
-----                             ---                     ---
bench_diff_git_git_read_tree_c    1.02     61.0±0.23µs    1.00     59.7±0.38µs
bench_diff_lines/modified/10k     1.00     41.6±0.24ms    1.02     42.3±0.22ms
bench_diff_lines/modified/1k      1.00      3.8±0.07ms    1.00      3.8±0.03ms
bench_diff_lines/reversed/10k     1.29     23.4±0.20ms    1.00     18.2±0.26ms
bench_diff_lines/reversed/1k      1.05    517.2±5.55µs    1.00   493.7±59.72µs
bench_diff_lines/unchanged/10k    1.00      3.9±0.10ms    1.08      4.2±0.10ms
bench_diff_lines/unchanged/1k     1.01    356.8±2.33µs    1.00    353.7±1.99µs
```
(I don't get stable results on my noisy machine, so the results would vary.)
  • Loading branch information
yuja committed Oct 4, 2024
1 parent 1f3e1c2 commit 8459d0c
Showing 1 changed file with 132 additions and 31 deletions.
163 changes: 132 additions & 31 deletions lib/src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::hash::BuildHasher;
use std::hash::Hash;
use std::hash::Hasher;
Expand All @@ -26,6 +25,7 @@ use std::ops::Range;
use std::slice;

use bstr::BStr;
use hashbrown::HashTable;
use itertools::Itertools;

pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
Expand Down Expand Up @@ -188,7 +188,6 @@ struct WordComparator<C, S> {
hash_builder: S,
}

#[allow(unused)] // TODO
impl<C: CompareBytes> WordComparator<C, RandomState> {
fn new(compare: C) -> Self {
WordComparator {
Expand All @@ -199,7 +198,6 @@ impl<C: CompareBytes> WordComparator<C, RandomState> {
}
}

#[allow(unused)] // TODO
impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
fn eq(&self, left: &[u8], right: &[u8]) -> bool {
self.compare.eq(left, right)
Expand Down Expand Up @@ -251,15 +249,25 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
}

struct Histogram<'input> {
word_to_positions: HashMap<&'input BStr, Vec<WordPosition>>,
word_to_positions: HashTable<HistogramEntry<'input>>,
}

type HistogramEntry<'input> = (&'input BStr, Vec<WordPosition>);

impl<'input> Histogram<'input> {
fn calculate(source: &DiffSource<'input, '_>, max_occurrences: usize) -> Self {
let mut word_to_positions: HashMap<&BStr, Vec<WordPosition>> = HashMap::new();
fn calculate<C: CompareBytes, S: BuildHasher>(
source: &DiffSource<'input, '_>,
comp: &WordComparator<C, S>,
max_occurrences: usize,
) -> Self {
let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
for (i, range) in source.ranges.iter().enumerate() {
let word = &source.text[range.clone()];
let positions = word_to_positions.entry(word).or_default();
let hash = comp.hash_one(word);
let (_, positions) = word_to_positions
.entry(hash, |(w, _)| comp.eq(w, word), |(w, _)| comp.hash_one(w))
.or_insert_with(|| (word, vec![]))
.into_mut();
// Allow one more than max_occurrences, so we can later skip those with more
// than max_occurrences
if positions.len() <= max_occurrences {
Expand All @@ -269,14 +277,27 @@ impl<'input> Histogram<'input> {
Histogram { word_to_positions }
}

fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<(&'input BStr, &Vec<WordPosition>)>> {
fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> {
let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
for (word, positions) in &self.word_to_positions {
for entry in &self.word_to_positions {
let (_, positions) = entry;
let entries = count_to_entries.entry(positions.len()).or_default();
entries.push((*word, positions));
entries.push(entry);
}
count_to_entries
}

fn positions_by_word<C: CompareBytes, S: BuildHasher>(
&self,
word: &BStr,
comp: &WordComparator<C, S>,
) -> Option<&[WordPosition]> {
let hash = comp.hash_one(word);
let (_, positions) = self
.word_to_positions
.find(hash, |(w, _)| comp.eq(w, word))?;
Some(positions)
}
}

/// Finds the LCS given a array where the value of `input[i]` indicates that
Expand Down Expand Up @@ -337,32 +358,33 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {

/// Finds unchanged word (or token) positions among the ones given as
/// arguments. The data between those words is ignored.
fn collect_unchanged_words(
fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
found_positions: &mut Vec<(WordPosition, WordPosition)>,
left: &DiffSource,
right: &DiffSource,
comp: &WordComparator<C, S>,
) {
if left.ranges.is_empty() || right.ranges.is_empty() {
return;
}

// Prioritize LCS-based algorithm than leading/trailing matches
let old_len = found_positions.len();
collect_unchanged_words_lcs(found_positions, left, right);
collect_unchanged_words_lcs(found_positions, left, right, comp);
if found_positions.len() != old_len {
return;
}

// Trim leading common ranges (i.e. grow previous unchanged region)
let common_leading_len = iter::zip(left.ranges, right.ranges)
.take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
.take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
.count();
let left_ranges = &left.ranges[common_leading_len..];
let right_ranges = &right.ranges[common_leading_len..];

// Trim trailing common ranges (i.e. grow next unchanged region)
let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev())
.take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
.take_while(|&(l, r)| comp.eq(&left.text[l.clone()], &right.text[r.clone()]))
.count();

found_positions.extend(itertools::chain(
Expand All @@ -381,19 +403,20 @@ fn collect_unchanged_words(
));
}

fn collect_unchanged_words_lcs(
fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
found_positions: &mut Vec<(WordPosition, WordPosition)>,
left: &DiffSource,
right: &DiffSource,
comp: &WordComparator<C, S>,
) {
let max_occurrences = 100;
let left_histogram = Histogram::calculate(left, max_occurrences);
let left_histogram = Histogram::calculate(left, comp, max_occurrences);
let left_count_to_entries = left_histogram.build_count_to_entries();
if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
// If there are very many occurrences of all words, then we just give up.
return;
}
let right_histogram = Histogram::calculate(right, max_occurrences);
let right_histogram = Histogram::calculate(right, comp, max_occurrences);
// Look for words with few occurrences in `left` (could equally well have picked
// `right`?). If any of them also occur in `right`, then we add the words to
// the LCS.
Expand All @@ -402,7 +425,7 @@ fn collect_unchanged_words_lcs(
let mut both_positions = left_entries
.iter()
.filter_map(|&(word, left_positions)| {
let right_positions = right_histogram.word_to_positions.get(word)?;
let right_positions = right_histogram.positions_by_word(word, comp)?;
(left_positions.len() == right_positions.len())
.then_some((left_positions, right_positions))
})
Expand Down Expand Up @@ -446,6 +469,7 @@ fn collect_unchanged_words_lcs(
found_positions,
&left.narrowed(previous_left_position..left_position),
&right.narrowed(previous_right_position..right_position),
comp,
);
found_positions.push((
left.map_to_global(left_position),
Expand All @@ -459,6 +483,7 @@ fn collect_unchanged_words_lcs(
found_positions,
&left.narrowed(previous_left_position..WordPosition(left.ranges.len())),
&right.narrowed(previous_right_position..WordPosition(right.ranges.len())),
comp,
);
}

Expand Down Expand Up @@ -532,6 +557,7 @@ impl<'input> Diff<'input> {
pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
compare: impl CompareBytes,
) -> Self {
let mut inputs = inputs.into_iter().map(BStr::new);
let base_input = inputs.next().expect("inputs must not be empty");
Expand All @@ -540,7 +566,9 @@ impl<'input> Diff<'input> {
let base_token_ranges: Vec<Range<usize>>;
let other_token_ranges: Vec<Vec<Range<usize>>>;
// No need to tokenize if one of the inputs is empty. Non-empty inputs
// are all different.
// are all different as long as the tokenizer emits non-empty ranges.
// This means "" and " " are different even if the compare function is
// ignore-whitespace. They are tokenized as [] and [" "] respectively.
if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
base_token_ranges = vec![];
other_token_ranges = iter::repeat(vec![]).take(other_inputs.len()).collect();
Expand All @@ -556,6 +584,7 @@ impl<'input> Diff<'input> {
other_inputs,
&base_token_ranges,
&other_token_ranges,
compare,
)
}

Expand All @@ -564,8 +593,10 @@ impl<'input> Diff<'input> {
other_inputs: Vec<&'input BStr>,
base_token_ranges: &[Range<usize>],
other_token_ranges: &[Vec<Range<usize>>],
compare: impl CompareBytes,
) -> Self {
assert_eq!(other_inputs.len(), other_token_ranges.len());
let comp = WordComparator::new(compare);
let base_source = DiffSource::new(base_input, base_token_ranges);
let other_sources = iter::zip(&other_inputs, other_token_ranges)
.map(|(input, token_ranges)| DiffSource::new(input, token_ranges))
Expand All @@ -584,7 +615,12 @@ impl<'input> Diff<'input> {
// found ranges with the ranges in the diff.
[first_other_source, tail_other_sources @ ..] => {
let mut first_positions = Vec::new();
collect_unchanged_words(&mut first_positions, &base_source, first_other_source);
collect_unchanged_words(
&mut first_positions,
&base_source,
first_other_source,
&comp,
);
if tail_other_sources.is_empty() {
first_positions
.iter()
Expand All @@ -606,7 +642,12 @@ impl<'input> Diff<'input> {
first_positions,
|current_positions, other_source| {
let mut new_positions = Vec::new();
collect_unchanged_words(&mut new_positions, &base_source, other_source);
collect_unchanged_words(
&mut new_positions,
&base_source,
other_source,
&comp,
);
intersect_unchanged_words(current_positions, &new_positions)
},
);
Expand Down Expand Up @@ -645,14 +686,14 @@ impl<'input> Diff<'input> {
pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
Diff::for_tokenizer(inputs, |_| vec![])
Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly)
}

/// Compares `inputs` line by line.
pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
Diff::for_tokenizer(inputs, find_line_ranges)
Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly)
}

/// Compares `inputs` word by word.
Expand All @@ -662,8 +703,8 @@ impl<'input> Diff<'input> {
pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
let mut diff = Diff::for_tokenizer(inputs, find_word_ranges);
diff.refine_changed_regions(find_nonword_ranges);
let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly);
diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
diff
}

Expand Down Expand Up @@ -694,7 +735,11 @@ impl<'input> Diff<'input> {

/// Uses the given tokenizer to split the changed regions into smaller
/// regions. Then tries to finds unchanged regions among them.
pub fn refine_changed_regions(&mut self, tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>) {
pub fn refine_changed_regions(
&mut self,
tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
compare: impl CompareBytes,
) {
let mut previous = UnchangedRange {
base: 0..0,
others: vec![0..0; self.other_inputs.len()],
Expand All @@ -706,7 +751,7 @@ impl<'input> Diff<'input> {
// offsets to be valid in the context of the larger Diff instance
// (`self`).
let refined_diff =
Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer);
Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer, &compare);
for refined in &refined_diff.unchanged_regions {
let new_base_start = refined.base.start + previous.base.end;
let new_base_end = refined.base.end + previous.base.end;
Expand Down Expand Up @@ -821,9 +866,9 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>(
inputs: impl IntoIterator<Item = &'a T>,
) -> Vec<DiffHunk<'a>> {
let mut diff = Diff::for_tokenizer(inputs, find_line_ranges);
diff.refine_changed_regions(find_word_ranges);
diff.refine_changed_regions(find_nonword_ranges);
let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly);
diff.refine_changed_regions(find_word_ranges, CompareBytesExactly);
diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
diff.hunks().collect()
}

Expand Down Expand Up @@ -1001,8 +1046,9 @@ mod tests {
left: &DiffSource,
right: &DiffSource,
) -> Vec<(Range<usize>, Range<usize>)> {
let comp = WordComparator::new(CompareBytesExactly);
let mut positions = Vec::new();
collect_unchanged_words(&mut positions, left, right);
collect_unchanged_words(&mut positions, left, right, &comp);
positions
.into_iter()
.map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
Expand Down Expand Up @@ -1217,6 +1263,7 @@ mod tests {
let diff = Diff::for_tokenizer(
["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
find_line_ranges,
CompareBytesExactly,
);
assert_eq!(
diff.hunks().collect_vec(),
Expand Down Expand Up @@ -1287,6 +1334,60 @@ mod tests {
);
}

#[test]
fn test_diff_ignore_all_whitespace() {
fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
let diff =
Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace);
diff.hunks().collect()
}

assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
assert_eq!(diff(["a\n", " a\r\n"]), vec![DiffHunk::matching("a\n")]);
assert_eq!(
diff(["a\n", " a\nb"]),
vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])]
);

// No LCS matches, so trim leading/trailing common lines
assert_eq!(
diff(["a\nc\n", " a\n a\n"]),
vec![
DiffHunk::matching("a\n"),
DiffHunk::different(["c\n", " a\n"]),
]
);
assert_eq!(
diff(["c\na\n", " a\n a\n"]),
vec![
DiffHunk::different(["c\n", " a\n"]),
DiffHunk::matching("a\n"),
]
);
}

#[test]
fn test_diff_ignore_whitespace_amount() {
fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
let diff =
Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount);
diff.hunks().collect()
}

assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
// whitespace at line end is ignored
assert_eq!(diff(["a\n", "a\r\n"]), vec![DiffHunk::matching("a\n")]);
// but whitespace at line start isn't
assert_eq!(
diff(["a\n", " a\n"]),
vec![DiffHunk::different(["a\n", " a\n"])]
);
assert_eq!(
diff(["a\n", "a \nb"]),
vec![DiffHunk::matching("a\n"), DiffHunk::different(["", "b"])]
);
}

#[test]
fn test_diff_real_case_write_fmt() {
// This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
Expand Down

0 comments on commit 8459d0c

Please sign in to comment.