From c6bb019d41e64ec88872d7677f70186472522b09 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Thu, 21 Nov 2024 14:10:55 +0900 Subject: [PATCH] diff: optimize allocation of histogram entries for unique words MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` group new old ----- --- --- bench_diff_git_git_read_tree_c 1.00 34.5±0.26µs 1.32 45.7±0.11µs bench_diff_lines/modified/10k 1.00 28.2±0.10ms 1.19 33.5±0.69ms bench_diff_lines/modified/1k 1.00 2.6±0.01ms 1.15 3.0±0.01ms bench_diff_lines/reversed/10k 1.00 21.5±0.22ms 1.08 23.3±0.18ms bench_diff_lines/reversed/1k 1.00 364.8±11.96µs 1.22 445.1±8.99µs bench_diff_lines/unchanged/10k 1.00 1761.3±13.85µs 1.66 2.9±0.07ms bench_diff_lines/unchanged/1k 1.00 163.6±1.25µs 1.47 240.7±2.72µs ``` ``` % hyperfine --sort command --warmup 3 --runs 5 -L bin jj-0,jj-1 \ 'target/release-with-debug/{bin} --ignore-working-copy \ file annotate lib/src/revset.rs' Benchmark 1: target/release-with-debug/jj-0 .. Time (mean ± σ): 1.144 s ± 0.011 s [User: 1.088 s, System: 0.053 s] Range (min … max): 1.131 s … 1.159 s 5 runs Benchmark 2: target/release-with-debug/jj-1 .. Time (mean ± σ): 1.026 s ± 0.008 s [User: 0.975 s, System: 0.048 s] Range (min … max): 1.015 s … 1.035 s 5 runs ``` --- lib/src/diff.rs | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 54558ed882..4cb3f0a907 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -307,7 +307,9 @@ struct Histogram<'input> { word_to_positions: HashTable<HistogramEntry<'input>>, } -type HistogramEntry<'input> = (HashedWord<'input>, Vec<LocalWordPosition>); +// Many of the words are unique. We can inline up to 2 word positions (16 bytes +// on 64-bit platform) in SmallVec for free. 
+type HistogramEntry<'input> = (HashedWord<'input>, SmallVec<[LocalWordPosition; 2]>); impl<'input> Histogram<'input> { fn calculate( @@ -317,19 +319,21 @@ impl<'input> Histogram<'input> { ) -> Self { let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new(); for (i, word) in source.hashed_words().enumerate() { - let (_, positions) = word_to_positions + let pos = LocalWordPosition(i); + word_to_positions .entry( word.hash, |(w, _)| comp.eq(w.text, word.text), |(w, _)| w.hash, ) - .or_insert_with(|| (word, vec![])) - .into_mut(); - // Allow one more than max_occurrences, so we can later skip those with more - // than max_occurrences - if positions.len() <= max_occurrences { - positions.push(LocalWordPosition(i)); - } + .and_modify(|(_, positions)| { + // Allow one more than max_occurrences, so we can later skip + // those with more than max_occurrences + if positions.len() <= max_occurrences { + positions.push(pos); + } + }) + .or_insert_with(|| (word, smallvec![pos])); } Histogram { word_to_positions } }