From c6bb019d41e64ec88872d7677f70186472522b09 Mon Sep 17 00:00:00 2001
From: Yuya Nishihara <yuya@tcha.org>
Date: Thu, 21 Nov 2024 14:10:55 +0900
Subject: [PATCH] diff: optimize allocation of histogram entries for unique
 words
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

```
group                             new                     old
-----                             ---                     ---
bench_diff_git_git_read_tree_c    1.00     34.5±0.26µs    1.32     45.7±0.11µs
bench_diff_lines/modified/10k     1.00     28.2±0.10ms    1.19     33.5±0.69ms
bench_diff_lines/modified/1k      1.00      2.6±0.01ms    1.15      3.0±0.01ms
bench_diff_lines/reversed/10k     1.00     21.5±0.22ms    1.08     23.3±0.18ms
bench_diff_lines/reversed/1k      1.00   364.8±11.96µs    1.22    445.1±8.99µs
bench_diff_lines/unchanged/10k    1.00  1761.3±13.85µs    1.66      2.9±0.07ms
bench_diff_lines/unchanged/1k     1.00    163.6±1.25µs    1.47    240.7±2.72µs
```

```
% hyperfine --sort command --warmup 3 --runs 5 -L bin jj-0,jj-1 \
  'target/release-with-debug/{bin} --ignore-working-copy \
  file annotate lib/src/revset.rs'
Benchmark 1: target/release-with-debug/jj-0 ..
  Time (mean ± σ):      1.144 s ±  0.011 s    [User: 1.088 s, System: 0.053 s]
  Range (min … max):    1.131 s …  1.159 s    5 runs

Benchmark 2: target/release-with-debug/jj-1 ..
  Time (mean ± σ):      1.026 s ±  0.008 s    [User: 0.975 s, System: 0.048 s]
  Range (min … max):    1.015 s …  1.035 s    5 runs
```
---
 lib/src/diff.rs | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/lib/src/diff.rs b/lib/src/diff.rs
index 54558ed882..4cb3f0a907 100644
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@@ -307,7 +307,9 @@ struct Histogram<'input> {
     word_to_positions: HashTable<HistogramEntry<'input>>,
 }
 
-type HistogramEntry<'input> = (HashedWord<'input>, Vec<LocalWordPosition>);
+// Many of the words are unique. We can inline up to 2 word positions (16 bytes
+// on a 64-bit platform) in SmallVec for free.
+type HistogramEntry<'input> = (HashedWord<'input>, SmallVec<[LocalWordPosition; 2]>);
 
 impl<'input> Histogram<'input> {
     fn calculate<C: CompareBytes, S: BuildHasher>(
@@ -317,19 +319,21 @@ impl<'input> Histogram<'input> {
     ) -> Self {
         let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
         for (i, word) in source.hashed_words().enumerate() {
-            let (_, positions) = word_to_positions
+            let pos = LocalWordPosition(i);
+            word_to_positions
                 .entry(
                     word.hash,
                     |(w, _)| comp.eq(w.text, word.text),
                     |(w, _)| w.hash,
                 )
-                .or_insert_with(|| (word, vec![]))
-                .into_mut();
-            // Allow one more than max_occurrences, so we can later skip those with more
-            // than max_occurrences
-            if positions.len() <= max_occurrences {
-                positions.push(LocalWordPosition(i));
-            }
+                .and_modify(|(_, positions)| {
+                    // Allow one more than max_occurrences, so we can later skip
+                    // those with more than max_occurrences
+                    if positions.len() <= max_occurrences {
+                        positions.push(pos);
+                    }
+                })
+                .or_insert_with(|| (word, smallvec![pos]));
         }
         Histogram { word_to_positions }
     }