Skip to content

Commit

Permalink
index: compact parent encoding, inline up to two parents
Browse files Browse the repository at this point in the history
This saves 4 more bytes per entry, and more importantly, most commit parents
can be resolved with no indirection to the overflow table.

IIRC, Git always inlines the first parent, but that wouldn't be useful in jj,
since jj diffs a merge commit against the auto-merged parents. The first merge
parent is nothing special.

I'll use a similar encoding in the change id sstable, where only one position
will be inlined (to optimize for imported commits).

Benchmark numbers measuring the cost of building the change id index:
```
% hyperfine --sort command --warmup 3 --runs 20 -L bin jj-0,jj-1 \
  -s "target/release-with-debug/{bin} -R ~/mirrors/linux \
      --ignore-working-copy debug reindex" \
  "target/release-with-debug/{bin} -R ~/mirrors/linux \
    --ignore-working-copy log -r@ --config-toml='revsets.short-prefixes=\"\"'"
Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r@ --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     342.9 ms ±  14.5 ms    [User: 202.4 ms, System: 140.6 ms]
  Range (min … max):   326.6 ms … 360.6 ms    20 runs

Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r@ --config-toml='revsets.short-prefixes=""'
  Time (mean ± σ):     325.0 ms ±  13.6 ms    [User: 196.2 ms, System: 128.8 ms]
  Range (min … max):   311.6 ms … 343.2 ms    20 runs

Relative speed comparison
        1.06 ±  0.06  target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r@ --config-toml='revsets.short-prefixes=""'
        1.00          target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r@ --config-toml='revsets.short-prefixes=""'
```
  • Loading branch information
yuja committed Feb 13, 2024
1 parent 8177d84 commit 60883a3
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 45 deletions.
40 changes: 25 additions & 15 deletions lib/src/default_index/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use tempfile::NamedTempFile;
use super::composite::{AsCompositeIndex, ChangeIdIndexImpl, CompositeIndex, IndexSegment};
use super::entry::{IndexPosition, LocalPosition, SmallIndexPositionsVec};
use super::readonly::{
DefaultReadonlyIndex, ReadonlyIndexSegment, INDEX_SEGMENT_FILE_FORMAT_VERSION,
DefaultReadonlyIndex, ReadonlyIndexSegment, INDEX_SEGMENT_FILE_FORMAT_VERSION, OVERFLOW_FLAG,
};
use crate::backend::{ChangeId, CommitId};
use crate::commit::Commit;
Expand Down Expand Up @@ -198,22 +198,32 @@ impl MutableIndexSegment {
for entry in &self.graph {
buf.extend(entry.generation_number.to_le_bytes());

buf.extend(
u32::try_from(entry.parent_positions.len())
.unwrap()
.to_le_bytes(),
);
let mut parent1_pos = IndexPosition(0);
let parent_overflow_pos = u32::try_from(parent_overflow.len()).unwrap();
for (i, parent_pos) in entry.parent_positions.iter().enumerate() {
if i == 0 {
parent1_pos = *parent_pos;
} else {
parent_overflow.push(*parent_pos);
match entry.parent_positions.as_slice() {
[] => {
buf.extend((!0_u32).to_le_bytes());
buf.extend((!0_u32).to_le_bytes());
}
[pos1] => {
assert!(pos1.0 < OVERFLOW_FLAG);
buf.extend(pos1.0.to_le_bytes());
buf.extend((!0_u32).to_le_bytes());
}
[pos1, pos2] => {
assert!(pos1.0 < OVERFLOW_FLAG);
assert!(pos2.0 < OVERFLOW_FLAG);
buf.extend(pos1.0.to_le_bytes());
buf.extend(pos2.0.to_le_bytes());
}
positions => {
let overflow_pos = u32::try_from(parent_overflow.len()).unwrap();
let num_parents = u32::try_from(positions.len()).unwrap();
assert!(overflow_pos < OVERFLOW_FLAG);
assert!(num_parents < OVERFLOW_FLAG);
buf.extend((!overflow_pos).to_le_bytes());
buf.extend((!num_parents).to_le_bytes());
parent_overflow.extend_from_slice(positions);
}
}
buf.extend(parent1_pos.0.to_le_bytes());
buf.extend(parent_overflow_pos.to_le_bytes());

assert_eq!(entry.change_id.as_bytes().len(), self.change_id_length);
buf.extend_from_slice(entry.change_id.as_bytes());
Expand Down
87 changes: 57 additions & 30 deletions lib/src/default_index/readonly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use std::io::Read;
use std::path::Path;
use std::sync::Arc;

use smallvec::SmallVec;
use smallvec::smallvec;
use thiserror::Error;

use super::composite::{AsCompositeIndex, ChangeIdIndexImpl, CompositeIndex, IndexSegment};
Expand Down Expand Up @@ -73,7 +73,24 @@ impl ReadonlyIndexLoadError {
}

/// Current format version of the index segment file.
pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 3;
pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 4;

/// If set, the value is stored in the overflow table.
pub(crate) const OVERFLOW_FLAG: u32 = 0x8000_0000;

/// Raw 32-bit parent slot of a commit graph entry.
///
/// The word is either the global index position of a parent stored inline
/// (the `OVERFLOW_FLAG` bit is clear), or a bit-negated value referring to
/// the overflow table (the `OVERFLOW_FLAG` bit is set).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
struct ParentIndexPosition(u32);

impl ParentIndexPosition {
    /// Returns the inlined parent position, or `None` when the overflow flag
    /// is set on the raw word.
    fn as_inlined(self) -> Option<IndexPosition> {
        let Self(raw) = self;
        if raw & OVERFLOW_FLAG != 0 {
            return None;
        }
        Some(IndexPosition(raw))
    }

    /// Returns the bit-negated raw word when the overflow flag is set, or
    /// `None` when the word holds an inlined position.
    fn as_overflow(self) -> Option<u32> {
        match self.0 & OVERFLOW_FLAG {
            0 => None,
            _ => Some(!self.0),
        }
    }
}

struct CommitGraphEntry<'a> {
data: &'a [u8],
Expand All @@ -85,23 +102,19 @@ struct CommitGraphEntry<'a> {
// lowest set bit to determine which generation number the pointers point to.
impl CommitGraphEntry<'_> {
fn size(commit_id_length: usize, change_id_length: usize) -> usize {
16 + commit_id_length + change_id_length
12 + commit_id_length + change_id_length
}

/// Decodes the generation number stored as a little-endian `u32` in the
/// first four bytes of the entry.
fn generation_number(&self) -> u32 {
    let raw: [u8; 4] = self.data[..4].try_into().unwrap();
    u32::from_le_bytes(raw)
}

fn num_parents(&self) -> u32 {
u32::from_le_bytes(self.data[4..8].try_into().unwrap())
fn parent1_pos_or_overflow_pos(&self) -> ParentIndexPosition {
ParentIndexPosition(u32::from_le_bytes(self.data[4..8].try_into().unwrap()))
}

fn parent1_pos(&self) -> IndexPosition {
IndexPosition(u32::from_le_bytes(self.data[8..12].try_into().unwrap()))
}

fn parent2_overflow_pos(&self) -> u32 {
u32::from_le_bytes(self.data[12..16].try_into().unwrap())
fn parent2_pos_or_overflow_len(&self) -> ParentIndexPosition {
ParentIndexPosition(u32::from_le_bytes(self.data[8..12].try_into().unwrap()))
}

// TODO: Consider storing the change ids in a separate table. That table could
Expand All @@ -111,11 +124,11 @@ impl CommitGraphEntry<'_> {
// to better cache locality when walking it; ability to quickly find all
// commits associated with a change id.
fn change_id(&self) -> ChangeId {
ChangeId::new(self.data[16..][..self.change_id_length].to_vec())
ChangeId::new(self.data[12..][..self.change_id_length].to_vec())
}

fn commit_id(&self) -> CommitId {
CommitId::from_bytes(&self.data[16 + self.change_id_length..][..self.commit_id_length])
CommitId::from_bytes(&self.data[12 + self.change_id_length..][..self.commit_id_length])
}
}

Expand Down Expand Up @@ -156,9 +169,14 @@ impl CommitLookupEntry<'_> {
/// u32: number of overflow parent entries
/// for each entry, in some topological order with parents first:
/// u32: generation number
/// u32: number of parents
/// u32: global index position for parent 1
/// u32: position in the overflow table of parent 2
/// if number of parents <= 2:
/// u32: (< 0x8000_0000) global index position for parent 1
/// (==0xffff_ffff) no parent 1
/// u32: (< 0x8000_0000) global index position for parent 2
/// (==0xffff_ffff) no parent 2
/// else:
/// u32: (>=0x8000_0000) position in the overflow table, bit-negated
/// u32: (>=0x8000_0000) number of parents (in the overflow table), bit-negated
/// <change id length number of bytes>: change id
/// <commit id length number of bytes>: commit id
/// for each entry, sorted by commit id:
Expand Down Expand Up @@ -339,12 +357,14 @@ impl ReadonlyIndexSegment {
}
}

fn overflow_parent(&self, overflow_pos: u32) -> IndexPosition {
fn overflow_parents(&self, overflow_pos: u32, num_parents: u32) -> SmallIndexPositionsVec {
let offset = (overflow_pos as usize) * 4
+ (self.num_local_commits as usize) * self.commit_graph_entry_size
+ (self.num_local_commits as usize) * self.commit_lookup_entry_size;
let pos = u32::from_le_bytes(self.data[offset..][..4].try_into().unwrap());
IndexPosition(pos)
self.data[offset..][..(num_parents as usize) * 4]
.chunks_exact(4)
.map(|chunk| IndexPosition(u32::from_le_bytes(chunk.try_into().unwrap())))
.collect()
}

fn commit_id_byte_prefix_to_lookup_pos(&self, prefix: &CommitId) -> Option<u32> {
Expand Down Expand Up @@ -448,23 +468,30 @@ impl IndexSegment for ReadonlyIndexSegment {
}

fn num_parents(&self, local_pos: LocalPosition) -> u32 {
self.graph_entry(local_pos).num_parents()
let graph_entry = self.graph_entry(local_pos);
let pos1_or_overflow_pos = graph_entry.parent1_pos_or_overflow_pos();
let pos2_or_overflow_len = graph_entry.parent2_pos_or_overflow_len();
let inlined_len1 = pos1_or_overflow_pos.as_inlined().is_some() as u32;
let inlined_len2 = pos2_or_overflow_len.as_inlined().is_some() as u32;
let overflow_len = pos2_or_overflow_len.as_overflow().unwrap_or(0);
inlined_len1 + inlined_len2 + overflow_len
}

fn parent_positions(&self, local_pos: LocalPosition) -> SmallIndexPositionsVec {
let graph_entry = self.graph_entry(local_pos);
let mut parent_entries = SmallVec::with_capacity(graph_entry.num_parents() as usize);
if graph_entry.num_parents() >= 1 {
parent_entries.push(graph_entry.parent1_pos());
}
if graph_entry.num_parents() >= 2 {
let mut parent_overflow_pos = graph_entry.parent2_overflow_pos();
for _ in 1..graph_entry.num_parents() {
parent_entries.push(self.overflow_parent(parent_overflow_pos));
parent_overflow_pos += 1;
let pos1_or_overflow_pos = graph_entry.parent1_pos_or_overflow_pos();
let pos2_or_overflow_len = graph_entry.parent2_pos_or_overflow_len();
if let Some(pos1) = pos1_or_overflow_pos.as_inlined() {
if let Some(pos2) = pos2_or_overflow_len.as_inlined() {
smallvec![pos1, pos2]
} else {
smallvec![pos1]
}
} else {
let overflow_pos = pos1_or_overflow_pos.as_overflow().unwrap();
let num_parents = pos2_or_overflow_len.as_overflow().unwrap();
self.overflow_parents(overflow_pos, num_parents)
}
parent_entries
}
}

Expand Down

0 comments on commit 60883a3

Please sign in to comment.