Skip to content

Commit

Permalink
index: don't store commit ids in sorted lookup table to save disk space
Browse files Browse the repository at this point in the history
This reduces the index file size. In my linux mirror repo containing 1591524
commits, the initial index file shrank from 122MB to 92MB. In theory, this
makes commit id lookup slower because of the additional indirection and cache
misses, but I don't see a significant difference. In a mid-size repo, this is
actually a bit faster thanks to smaller index reads.

Alternatively, the commit id field could be removed from the CommitGraphEntry,
but doing that would introduce an indirect lookup there, and the index disk
size wouldn't be as small as with this change.

- jj-0 baseline                         122MB
- jj-1 shrink CommitLookupEntry (this)   92MB
- jj-3 shrink CommitGraphEntry           98MB

Mid-size repo, "log" with the default template
```
% hyperfine --sort command --warmup 3 --runs 20 -L bin jj-0,jj-1,jj-2,jj-3 \
  -s "target/release-with-debug/{bin} -R ~/mirrors/linux debug reindex" \
  "target/release-with-debug/{bin} -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=\"\"'"
Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     177.7 ms ±  12.9 ms    [User: 96.3 ms, System: 81.5 ms]
  Range (min … max):   156.8 ms … 191.2 ms    20 runs

Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     169.8 ms ±  13.8 ms    [User: 93.3 ms, System: 76.6 ms]
  Range (min … max):   151.1 ms … 191.5 ms    20 runs

Benchmark 4: target/release-with-debug/jj-3 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     170.3 ms ±  13.4 ms    [User: 90.1 ms, System: 79.7 ms]
  Range (min … max):   154.8 ms … 186.2 ms    20 runs

Relative speed comparison
        1.05 ±  0.11  target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
        1.00          target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
        1.00 ±  0.11  target/release-with-debug/jj-3 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""'
```

Small repo, "log" of thousands of commits with -T"commit_id.shortest()"
```
% hyperfine --sort command --warmup 3 --runs 100 -L bin jj-0,jj-1,jj-2,jj-3 \
  -s "target/release-with-debug/{bin} -R ~/mirrors/git debug reindex" \
  "target/release-with-debug/{bin} -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=\"\"'"
Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     179.3 ms ±  12.8 ms    [User: 149.7 ms, System: 29.6 ms]
  Range (min … max):   155.2 ms … 191.0 ms    100 runs

Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     179.1 ms ±  13.7 ms    [User: 148.5 ms, System: 30.5 ms]
  Range (min … max):   157.2 ms … 196.7 ms    100 runs

Benchmark 4: target/release-with-debug/jj-3 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     178.2 ms ±  13.6 ms    [User: 148.7 ms, System: 29.6 ms]
  Range (min … max):   156.5 ms … 191.7 ms    100 runs

Relative speed comparison
        1.01 ±  0.11  target/release-with-debug/jj-0 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
        1.01 ±  0.11  target/release-with-debug/jj-1 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
        1.01 ±  0.11  target/release-with-debug/jj-3 -R ~/mirrors/git --ignore-working-copy log -r.. -l5000 -T'commit_id.shortest()' --config-toml='revsets.short-prefixes=""'
```
  • Loading branch information
yuja committed Feb 18, 2024
1 parent a1b16c5 commit c146b5f
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 42 deletions.
3 changes: 1 addition & 2 deletions lib/src/default_index/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,7 @@ impl MutableIndexSegment {
buf.extend_from_slice(entry.commit_id.as_bytes());
}

for (commit_id, LocalPosition(pos)) in &self.commit_lookup {
buf.extend_from_slice(commit_id.as_bytes());
for LocalPosition(pos) in self.commit_lookup.values() {
buf.extend(pos.to_le_bytes());
}

Expand Down
61 changes: 21 additions & 40 deletions lib/src/default_index/readonly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ impl ReadonlyIndexLoadError {
}

/// Current format version of the index segment file.
pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 5;
pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 6;

/// If set, the value is stored in the overflow table.
pub(crate) const OVERFLOW_FLAG: u32 = 0x8000_0000;
Expand Down Expand Up @@ -133,33 +133,13 @@ impl CommitGraphEntry<'_> {
u32::from_le_bytes(self.data[12..16].try_into().unwrap())
}

fn commit_id(&self) -> CommitId {
CommitId::from_bytes(&self.data[16..])
}
}

struct CommitLookupEntry<'a> {
data: &'a [u8],
commit_id_length: usize,
}

impl CommitLookupEntry<'_> {
fn size(commit_id_length: usize) -> usize {
commit_id_length + 4
}

fn commit_id(&self) -> CommitId {
CommitId::from_bytes(self.commit_id_bytes())
}

// might be better to add borrowed version of CommitId
fn commit_id_bytes(&self) -> &[u8] {
&self.data[0..self.commit_id_length]
}

fn local_pos(&self) -> LocalPosition {
let pos = u32::from_le_bytes(self.data[self.commit_id_length..][..4].try_into().unwrap());
LocalPosition(pos)
&self.data[16..]
}
}

Expand Down Expand Up @@ -188,7 +168,6 @@ impl CommitLookupEntry<'_> {
/// u32: change id position in the sorted change ids table
/// <commit id length number of bytes>: commit id
/// for each entry, sorted by commit id:
/// <commit id length number of bytes>: commit id
/// u32: local position in the graph entries table
/// for each entry, sorted by change id:
/// <change id length number of bytes>: change id
Expand Down Expand Up @@ -324,8 +303,7 @@ impl ReadonlyIndexSegment {

let commit_graph_entry_size = CommitGraphEntry::size(commit_id_length);
let graph_size = (num_local_commits as usize) * commit_graph_entry_size;
let commit_lookup_entry_size = CommitLookupEntry::size(commit_id_length);
let commit_lookup_size = (num_local_commits as usize) * commit_lookup_entry_size;
let commit_lookup_size = (num_local_commits as usize) * 4;
let change_id_table_size = (num_local_change_ids as usize) * change_id_length;
let change_pos_table_size = (num_local_change_ids as usize) * 4;
let parent_overflow_size = (num_parent_overflow_entries as usize) * 4;
Expand Down Expand Up @@ -389,14 +367,10 @@ impl ReadonlyIndexSegment {
}
}

fn commit_lookup_entry(&self, lookup_pos: u32) -> CommitLookupEntry {
fn commit_lookup_pos(&self, lookup_pos: u32) -> LocalPosition {
let table = &self.data[self.commit_lookup_base..self.change_id_table_base];
let entry_size = CommitLookupEntry::size(self.commit_id_length);
let offset = (lookup_pos as usize) * entry_size;
CommitLookupEntry {
data: &table[offset..][..entry_size],
commit_id_length: self.commit_id_length,
}
let offset = (lookup_pos as usize) * 4;
LocalPosition(u32::from_le_bytes(table[offset..][..4].try_into().unwrap()))
}

fn change_lookup_id(&self, lookup_pos: u32) -> ChangeId {
Expand Down Expand Up @@ -438,7 +412,8 @@ impl ReadonlyIndexSegment {
/// Binary searches commit id by `prefix`. Returns the lookup position.
fn commit_id_byte_prefix_to_lookup_pos(&self, prefix: &[u8]) -> PositionLookupResult {
binary_search_pos_by(self.num_local_commits, |pos| {
let entry = self.commit_lookup_entry(pos);
let local_pos = self.commit_lookup_pos(pos);
let entry = self.graph_entry(local_pos);
entry.commit_id_bytes().cmp(prefix)
})
}
Expand Down Expand Up @@ -470,24 +445,30 @@ impl IndexSegment for ReadonlyIndexSegment {
}

fn commit_id_to_pos(&self, commit_id: &CommitId) -> Option<LocalPosition> {
let lookup_pos = self
.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes())
.ok()?;
let entry = self.commit_lookup_entry(lookup_pos);
Some(entry.local_pos())
self.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes())
.ok()
.map(|pos| self.commit_lookup_pos(pos))
}

fn resolve_neighbor_commit_ids(
&self,
commit_id: &CommitId,
) -> (Option<CommitId>, Option<CommitId>) {
self.commit_id_byte_prefix_to_lookup_pos(commit_id.as_bytes())
.map_neighbors(|pos| self.commit_lookup_entry(pos).commit_id())
.map_neighbors(|pos| {
let local_pos = self.commit_lookup_pos(pos);
let entry = self.graph_entry(local_pos);
entry.commit_id()
})
}

fn resolve_commit_id_prefix(&self, prefix: &HexPrefix) -> PrefixResolution<CommitId> {
self.commit_id_byte_prefix_to_lookup_pos(prefix.min_prefix_bytes())
.prefix_matches(prefix, |pos| self.commit_lookup_entry(pos).commit_id())
.prefix_matches(prefix, |pos| {
let local_pos = self.commit_lookup_pos(pos);
let entry = self.graph_entry(local_pos);
entry.commit_id()
})
.map(|(id, _)| id)
}

Expand Down

0 comments on commit c146b5f

Please sign in to comment.