From 016cadb33efd12d98e4a7d9157b6f97552aa9d9d Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Sun, 11 Feb 2024 15:59:33 +0900 Subject: [PATCH 1/5] index: clarify parent entries are global positions I'm going to add change id overflow table whose elements are of LocalPosition type. Let's make sure that the serialization code would break if we changed the underlying data type. --- lib/src/default_index/mutable.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/src/default_index/mutable.rs b/lib/src/default_index/mutable.rs index d5e6a7987f..26c1dd623d 100644 --- a/lib/src/default_index/mutable.rs +++ b/lib/src/default_index/mutable.rs @@ -203,16 +203,16 @@ impl MutableIndexSegment { buf.extend((!0_u32).to_le_bytes()); buf.extend((!0_u32).to_le_bytes()); } - [pos1] => { - assert!(pos1.0 < OVERFLOW_FLAG); - buf.extend(pos1.0.to_le_bytes()); + [IndexPosition(pos1)] => { + assert!(*pos1 < OVERFLOW_FLAG); + buf.extend(pos1.to_le_bytes()); buf.extend((!0_u32).to_le_bytes()); } - [pos1, pos2] => { - assert!(pos1.0 < OVERFLOW_FLAG); - assert!(pos2.0 < OVERFLOW_FLAG); - buf.extend(pos1.0.to_le_bytes()); - buf.extend(pos2.0.to_le_bytes()); + [IndexPosition(pos1), IndexPosition(pos2)] => { + assert!(*pos1 < OVERFLOW_FLAG); + assert!(*pos2 < OVERFLOW_FLAG); + buf.extend(pos1.to_le_bytes()); + buf.extend(pos2.to_le_bytes()); } positions => { let overflow_pos = u32::try_from(parent_overflow.len()).unwrap(); @@ -237,10 +237,10 @@ impl MutableIndexSegment { buf.extend(pos.to_le_bytes()); } - buf[parent_overflow_offset..][..4] - .copy_from_slice(&u32::try_from(parent_overflow.len()).unwrap().to_le_bytes()); - for parent_pos in parent_overflow { - buf.extend(parent_pos.0.to_le_bytes()); + let num_parent_overflow = u32::try_from(parent_overflow.len()).unwrap(); + buf[parent_overflow_offset..][..4].copy_from_slice(&num_parent_overflow.to_le_bytes()); + for IndexPosition(pos) in parent_overflow { + buf.extend(pos.to_le_bytes()); } } From b946bb6f7fcc6ccd1a8daab1767f4171b9269b8f Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Fri, 22 Dec 2023 15:28:50 +0900 Subject: [PATCH 2/5] index: move change ids to sstable, build change-id-to-pos lookup table This basically means that the change ids are interned. We'll implement binary search over the sorted change ids table. The table could be sorted differently for better cache locality, but it is in lexicographical order for simplicity. With my testing, the cost of the id lookup isn't dominant. Unlike the parent entries, the size of the per-id overflow items isn't saved. That's s because the number of the same-change-id commits is either 1 or many. It doesn't make sense to allocate 8 bytes for each change id. Instead, we'll pay extra indirection cost to determine the size. --- lib/src/default_index/entry.rs | 1 + lib/src/default_index/mutable.rs | 71 +++++++++++++++++++++++++---- lib/src/default_index/readonly.rs | 76 ++++++++++++++++++++++--------- lib/tests/test_index.rs | 6 ++- 4 files changed, 120 insertions(+), 34 deletions(-) diff --git a/lib/src/default_index/entry.rs b/lib/src/default_index/entry.rs index ae9ed5084d..f9773c03ea 100644 --- a/lib/src/default_index/entry.rs +++ b/lib/src/default_index/entry.rs @@ -38,6 +38,7 @@ pub(super) struct LocalPosition(pub(super) u32); // SmallVec reuses two pointer-size fields as inline area, which meas we can // inline up to 16 bytes (on 64-bit platform) for free. pub(super) type SmallIndexPositionsVec = SmallVec<[IndexPosition; 4]>; +pub(super) type SmallLocalPositionsVec = SmallVec<[LocalPosition; 4]>; #[derive(Clone)] pub struct IndexEntry<'a> { diff --git a/lib/src/default_index/mutable.rs b/lib/src/default_index/mutable.rs index 26c1dd623d..3153feb254 100644 --- a/lib/src/default_index/mutable.rs +++ b/lib/src/default_index/mutable.rs @@ -16,7 +16,7 @@ use std::any::Any; use std::cmp::max; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use std::io; use std::io::Write; use std::ops::Bound; @@ -26,11 +26,11 @@ use std::sync::Arc; use blake2::Blake2b512; use digest::Digest; use itertools::Itertools; -use smallvec::SmallVec; +use smallvec::{smallvec, SmallVec}; use tempfile::NamedTempFile; use super::composite::{AsCompositeIndex, ChangeIdIndexImpl, CompositeIndex, IndexSegment}; -use super::entry::{IndexPosition, LocalPosition, SmallIndexPositionsVec}; +use super::entry::{IndexPosition, LocalPosition, SmallIndexPositionsVec, SmallLocalPositionsVec}; use super::readonly::{ DefaultReadonlyIndex, ReadonlyIndexSegment, INDEX_SEGMENT_FILE_FORMAT_VERSION, OVERFLOW_FLAG, }; @@ -57,6 +57,7 @@ pub(super) struct MutableIndexSegment { change_id_length: usize, graph: Vec, commit_lookup: BTreeMap, + change_lookup: BTreeMap, } impl MutableIndexSegment { @@ -68,6 +69,7 @@ impl MutableIndexSegment { change_id_length, graph: vec![], commit_lookup: BTreeMap::new(), + change_lookup: BTreeMap::new(), } } @@ -82,6 +84,7 @@ impl MutableIndexSegment { change_id_length, graph: vec![], commit_lookup: BTreeMap::new(), + change_lookup: BTreeMap::new(), } } @@ -123,10 +126,14 @@ impl MutableIndexSegment { ); entry.parent_positions.push(parent_entry.position()); } - self.commit_lookup.insert( - entry.commit_id.clone(), - LocalPosition(u32::try_from(self.graph.len()).unwrap()), - ); + let local_pos = LocalPosition(u32::try_from(self.graph.len()).unwrap()); + self.commit_lookup + .insert(entry.commit_id.clone(), local_pos); + self.change_lookup + .entry(entry.change_id.clone()) + // positions are inherently sorted + .and_modify(|positions| positions.push(local_pos)) + .or_insert(smallvec![local_pos]); self.graph.push(entry); } @@ -187,12 +194,28 @@ impl MutableIndexSegment { fn serialize_local_entries(&self, buf: &mut Vec) { assert_eq!(self.graph.len(), self.commit_lookup.len()); + debug_assert_eq!( + self.graph.len(), + self.change_lookup.values().flatten().count() + ); let num_commits = u32::try_from(self.graph.len()).unwrap(); buf.extend(num_commits.to_le_bytes()); - // We'll write the actual value later + let num_change_ids = u32::try_from(self.change_lookup.len()).unwrap(); + buf.extend(num_change_ids.to_le_bytes()); + // We'll write the actual values later let parent_overflow_offset = buf.len(); buf.extend(0_u32.to_le_bytes()); + let change_overflow_offset = buf.len(); + buf.extend(0_u32.to_le_bytes()); + + // Positions of change ids in the sorted table + let change_id_pos_map: HashMap<&ChangeId, u32> = self + .change_lookup + .keys() + .enumerate() + .map(|(i, change_id)| (change_id, u32::try_from(i).unwrap())) + .collect(); let mut parent_overflow = vec![]; for entry in &self.graph { @@ -225,8 +248,7 @@ impl MutableIndexSegment { } } - assert_eq!(entry.change_id.as_bytes().len(), self.change_id_length); - buf.extend_from_slice(entry.change_id.as_bytes()); + buf.extend(change_id_pos_map[&entry.change_id].to_le_bytes()); assert_eq!(entry.commit_id.as_bytes().len(), self.commit_id_length); buf.extend_from_slice(entry.commit_id.as_bytes()); @@ -237,11 +259,40 @@ impl MutableIndexSegment { buf.extend(pos.to_le_bytes()); } + for change_id in self.change_lookup.keys() { + assert_eq!(change_id.as_bytes().len(), self.change_id_length); + buf.extend_from_slice(change_id.as_bytes()); + } + + let mut change_overflow = vec![]; + for positions in self.change_lookup.values() { + match positions.as_slice() { + [] => panic!("change id lookup entry must not be empty"), + // Optimize for imported commits + [LocalPosition(pos1)] => { + assert!(*pos1 < OVERFLOW_FLAG); + buf.extend(pos1.to_le_bytes()); + } + positions => { + let overflow_pos = u32::try_from(change_overflow.len()).unwrap(); + assert!(overflow_pos < OVERFLOW_FLAG); + buf.extend((!overflow_pos).to_le_bytes()); + change_overflow.extend_from_slice(positions); + } + } + } + let num_parent_overflow = u32::try_from(parent_overflow.len()).unwrap(); buf[parent_overflow_offset..][..4].copy_from_slice(&num_parent_overflow.to_le_bytes()); for IndexPosition(pos) in parent_overflow { buf.extend(pos.to_le_bytes()); } + + let num_change_overflow = u32::try_from(change_overflow.len()).unwrap(); + buf[change_overflow_offset..][..4].copy_from_slice(&num_change_overflow.to_le_bytes()); + for LocalPosition(pos) in change_overflow { + buf.extend(pos.to_le_bytes()); + } } /// If the MutableIndex has more than half the commits of its parent diff --git a/lib/src/default_index/readonly.rs b/lib/src/default_index/readonly.rs index bee735a934..656578d71f 100644 --- a/lib/src/default_index/readonly.rs +++ b/lib/src/default_index/readonly.rs @@ -73,7 +73,7 @@ impl ReadonlyIndexLoadError { } /// Current format version of the index segment file. -pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 4; +pub(crate) const INDEX_SEGMENT_FILE_FORMAT_VERSION: u32 = 5; /// If set, the value is stored in the overflow table. pub(crate) const OVERFLOW_FLAG: u32 = 0x8000_0000; @@ -94,15 +94,13 @@ impl ParentIndexPosition { struct CommitGraphEntry<'a> { data: &'a [u8], - commit_id_length: usize, - change_id_length: usize, } // TODO: Add pointers to ancestors further back, like a skip list. Clear the // lowest set bit to determine which generation number the pointers point to. impl CommitGraphEntry<'_> { - fn size(commit_id_length: usize, change_id_length: usize) -> usize { - 12 + commit_id_length + change_id_length + fn size(commit_id_length: usize) -> usize { + 16 + commit_id_length } fn generation_number(&self) -> u32 { @@ -117,18 +115,12 @@ impl CommitGraphEntry<'_> { ParentIndexPosition(u32::from_le_bytes(self.data[8..12].try_into().unwrap())) } - // TODO: Consider storing the change ids in a separate table. That table could - // be sorted by change id and have the end index into a list as value. That list - // would be the concatenation of all index positions associated with the change. - // Possible advantages: avoids duplicating change ids; smaller main graph leads - // to better cache locality when walking it; ability to quickly find all - // commits associated with a change id. - fn change_id(&self) -> ChangeId { - ChangeId::new(self.data[12..][..self.change_id_length].to_vec()) + fn change_id_lookup_pos(&self) -> u32 { + u32::from_le_bytes(self.data[12..16].try_into().unwrap()) } fn commit_id(&self) -> CommitId { - CommitId::from_bytes(&self.data[12 + self.change_id_length..][..self.commit_id_length]) + CommitId::from_bytes(&self.data[16..]) } } @@ -165,8 +157,10 @@ impl CommitLookupEntry<'_> { /// u32: parent segment file name length (0 means root) /// : parent segment file name /// -/// u32: number of local entries +/// u32: number of local commit entries +/// u32: number of local change ids /// u32: number of overflow parent entries +/// u32: number of overflow change id positions /// for each entry, in some topological order with parents first: /// u32: generation number /// if number of parents <= 2: @@ -177,13 +171,22 @@ impl CommitLookupEntry<'_> { /// else: /// u32: (>=0x8000_0000) position in the overflow table, bit-negated /// u32: (>=0x8000_0000) number of parents (in the overflow table), bit-negated -/// : change id +/// u32: change id position in the sorted change ids table /// : commit id /// for each entry, sorted by commit id: /// : commit id /// u32: local position in the graph entries table +/// for each entry, sorted by change id: +/// : change id +/// for each entry, sorted by change id: +/// if number of associated commits == 1: +/// u32: (< 0x8000_0000) local position in the graph entries table +/// else: +/// u32: (>=0x8000_0000) position in the overflow table, bit-negated /// for each overflow parent: /// u32: global index position +/// for each overflow change id entry: +/// u32: local position in the graph entries table /// ``` /// /// Note that u32 fields are 4-byte aligned so long as the parent file name @@ -201,6 +204,8 @@ pub(super) struct ReadonlyIndexSegment { commit_lookup_entry_size: usize, // Number of commits not counting the parent file num_local_commits: u32, + num_local_change_ids: u32, + num_parent_overflow_entries: u32, data: Vec, } @@ -293,15 +298,25 @@ impl ReadonlyIndexSegment { .as_ref() .map_or(0, |segment| segment.as_composite().num_commits()); let num_local_commits = read_u32(file)?; + let num_local_change_ids = read_u32(file)?; let num_parent_overflow_entries = read_u32(file)?; + let num_change_overflow_entries = read_u32(file)?; let mut data = vec![]; file.read_to_end(&mut data).map_err(from_io_err)?; - let commit_graph_entry_size = CommitGraphEntry::size(commit_id_length, change_id_length); + let commit_graph_entry_size = CommitGraphEntry::size(commit_id_length); let graph_size = (num_local_commits as usize) * commit_graph_entry_size; let commit_lookup_entry_size = CommitLookupEntry::size(commit_id_length); let commit_lookup_size = (num_local_commits as usize) * commit_lookup_entry_size; + let change_id_table_size = (num_local_change_ids as usize) * change_id_length; + let change_pos_table_size = (num_local_change_ids as usize) * 4; let parent_overflow_size = (num_parent_overflow_entries as usize) * 4; - let expected_size = graph_size + commit_lookup_size + parent_overflow_size; + let change_overflow_size = (num_change_overflow_entries as usize) * 4; + let expected_size = graph_size + + commit_lookup_size + + change_id_table_size + + change_pos_table_size + + parent_overflow_size + + change_overflow_size; if data.len() != expected_size { return Err(ReadonlyIndexLoadError::invalid_data( name, @@ -317,6 +332,8 @@ impl ReadonlyIndexSegment { commit_graph_entry_size, commit_lookup_entry_size, num_local_commits, + num_local_change_ids, + num_parent_overflow_entries, data, })) } @@ -342,8 +359,6 @@ impl ReadonlyIndexSegment { let offset = (local_pos.0 as usize) * self.commit_graph_entry_size; CommitGraphEntry { data: &self.data[offset..][..self.commit_graph_entry_size], - commit_id_length: self.commit_id_length, - change_id_length: self.change_id_length, } } @@ -357,10 +372,26 @@ impl ReadonlyIndexSegment { } } + fn change_lookup_id(&self, lookup_pos: u32) -> ChangeId { + ChangeId::from_bytes(self.change_lookup_id_bytes(lookup_pos)) + } + + // might be better to add borrowed version of ChangeId + fn change_lookup_id_bytes(&self, lookup_pos: u32) -> &[u8] { + assert!(lookup_pos < self.num_local_change_ids); + let offset = (lookup_pos as usize) * self.change_id_length + + (self.num_local_commits as usize) * self.commit_graph_entry_size + + (self.num_local_commits as usize) * self.commit_lookup_entry_size; + &self.data[offset..][..self.change_id_length] + } + fn overflow_parents(&self, overflow_pos: u32, num_parents: u32) -> SmallIndexPositionsVec { + assert!(overflow_pos + num_parents <= self.num_parent_overflow_entries); let offset = (overflow_pos as usize) * 4 + (self.num_local_commits as usize) * self.commit_graph_entry_size - + (self.num_local_commits as usize) * self.commit_lookup_entry_size; + + (self.num_local_commits as usize) * self.commit_lookup_entry_size + + (self.num_local_change_ids as usize) * self.change_id_length + + (self.num_local_change_ids as usize) * 4; self.data[offset..][..(num_parents as usize) * 4] .chunks_exact(4) .map(|chunk| IndexPosition(u32::from_le_bytes(chunk.try_into().unwrap()))) @@ -423,7 +454,8 @@ impl IndexSegment for ReadonlyIndexSegment { } fn change_id(&self, local_pos: LocalPosition) -> ChangeId { - self.graph_entry(local_pos).change_id() + let entry = self.graph_entry(local_pos); + self.change_lookup_id(entry.change_id_lookup_pos()) } fn num_parents(&self, local_pos: LocalPosition) -> u32 { diff --git a/lib/tests/test_index.rs b/lib/tests/test_index.rs index 7912064ca0..f08b420959 100644 --- a/lib/tests/test_index.rs +++ b/lib/tests/test_index.rs @@ -624,9 +624,11 @@ fn test_reindex_corrupt_segment_files() { let entry = entry.unwrap(); // u32: file format version // u32: parent segment file name length (0 means root) - // u32: number of local entries + // u32: number of local commit entries + // u32: number of local change ids // u32: number of overflow parent entries - fs::write(entry.path(), b"\0".repeat(16)).unwrap() + // u32: number of overflow change id positions + fs::write(entry.path(), b"\0".repeat(24)).unwrap() } let repo = load_repo_at_head(&settings, repo.repo_path()); From bc3f444c7feae2e2c3d1cfe2e94f4afc09137d5e Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Fri, 22 Dec 2023 16:02:36 +0900 Subject: [PATCH 3/5] index: implement segment-level change id lookup methods In resolve_change_id_prefix(), I've implemented two different ways of collecting the overflow items. I don't think they impact the performance, but we can switch to the alternative method as needed. --- lib/src/default_index/composite.rs | 11 ++ lib/src/default_index/mod.rs | 204 ++++++++++++++++++++++++++++- lib/src/default_index/mutable.rs | 26 +++- lib/src/default_index/readonly.rs | 98 +++++++++++++- 4 files changed, 330 insertions(+), 9 deletions(-) diff --git a/lib/src/default_index/composite.rs b/lib/src/default_index/composite.rs index c4fbf5fd90..79f2aa6fcc 100644 --- a/lib/src/default_index/composite.rs +++ b/lib/src/default_index/composite.rs @@ -23,6 +23,7 @@ use itertools::Itertools; use super::entry::{ IndexEntry, IndexPosition, IndexPositionByGeneration, LocalPosition, SmallIndexPositionsVec, + SmallLocalPositionsVec, }; use super::readonly::ReadonlyIndexSegment; use super::rev_walk::RevWalk; @@ -55,6 +56,16 @@ pub(super) trait IndexSegment: Send + Sync { fn resolve_commit_id_prefix(&self, prefix: &HexPrefix) -> PrefixResolution; + fn resolve_neighbor_change_ids( + &self, + change_id: &ChangeId, + ) -> (Option, Option); + + fn resolve_change_id_prefix( + &self, + prefix: &HexPrefix, + ) -> PrefixResolution<(ChangeId, SmallLocalPositionsVec)>; + fn generation_number(&self, local_pos: LocalPosition) -> u32; fn commit_id(&self, local_pos: LocalPosition) -> CommitId; diff --git a/lib/src/default_index/mod.rs b/lib/src/default_index/mod.rs index 1b94ac970b..a11c710cf0 100644 --- a/lib/src/default_index/mod.rs +++ b/lib/src/default_index/mod.rs @@ -46,9 +46,16 @@ mod tests { use super::mutable::MutableIndexSegment; use super::*; use crate::backend::{ChangeId, CommitId}; + use crate::default_index::entry::{LocalPosition, SmallLocalPositionsVec}; use crate::index::Index; use crate::object_id::{HexPrefix, ObjectId, PrefixResolution}; + /// Generator of unique 16-byte CommitId excluding root id + fn commit_id_generator() -> impl FnMut() -> CommitId { + let mut iter = (1_u128..).map(|n| CommitId::new(n.to_le_bytes().into())); + move || iter.next().unwrap() + } + /// Generator of unique 16-byte ChangeId excluding root id fn change_id_generator() -> impl FnMut() -> ChangeId { let mut iter = (1_u128..).map(|n| ChangeId::new(n.to_le_bytes().into())); @@ -328,7 +335,7 @@ mod tests { } #[test] - fn resolve_prefix() { + fn resolve_commit_id_prefix() { let temp_dir = testutils::new_temp_dir(); let mut new_change_id = change_id_generator(); let mut mutable_segment = MutableIndexSegment::full(3, 16); @@ -581,6 +588,201 @@ mod tests { ); } + #[test] + fn resolve_change_id_prefix() { + let temp_dir = testutils::new_temp_dir(); + let mut new_commit_id = commit_id_generator(); + let local_positions_vec = |positions: &[u32]| -> SmallLocalPositionsVec { + positions.iter().copied().map(LocalPosition).collect() + }; + + let id_0 = ChangeId::from_hex("00000001"); + let id_1 = ChangeId::from_hex("00999999"); + let id_2 = ChangeId::from_hex("05548888"); + let id_3 = ChangeId::from_hex("05544444"); + let id_4 = ChangeId::from_hex("05555555"); + let id_5 = ChangeId::from_hex("05555333"); + + // Create some commits with different various common prefixes. + let mut mutable_segment = MutableIndexSegment::full(16, 4); + mutable_segment.add_commit_data(new_commit_id(), id_0.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + + // Write these commits to one file and build the remainder on top. + let initial_file = mutable_segment.save_in(temp_dir.path()).unwrap(); + mutable_segment = MutableIndexSegment::incremental(initial_file.clone()); + + mutable_segment.add_commit_data(new_commit_id(), id_3.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_3.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_4.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_5.clone(), &[]); + + // Local lookup in readonly index with the full hex digits + assert_eq!( + initial_file.resolve_change_id_prefix(&HexPrefix::new(&id_0.hex()).unwrap()), + PrefixResolution::SingleMatch((id_0.clone(), local_positions_vec(&[0]))) + ); + assert_eq!( + initial_file.resolve_change_id_prefix(&HexPrefix::new(&id_1.hex()).unwrap()), + PrefixResolution::SingleMatch((id_1.clone(), local_positions_vec(&[1, 3]))) + ); + assert_eq!( + initial_file.resolve_change_id_prefix(&HexPrefix::new(&id_2.hex()).unwrap()), + PrefixResolution::SingleMatch((id_2.clone(), local_positions_vec(&[2, 4, 5]))) + ); + + // Local lookup in mutable index with the full hex digits + assert_eq!( + mutable_segment.resolve_change_id_prefix(&HexPrefix::new(&id_1.hex()).unwrap()), + PrefixResolution::SingleMatch((id_1.clone(), local_positions_vec(&[3]))) + ); + assert_eq!( + mutable_segment.resolve_change_id_prefix(&HexPrefix::new(&id_3.hex()).unwrap()), + PrefixResolution::SingleMatch((id_3.clone(), local_positions_vec(&[0, 1]))) + ); + assert_eq!( + mutable_segment.resolve_change_id_prefix(&HexPrefix::new(&id_4.hex()).unwrap()), + PrefixResolution::SingleMatch((id_4.clone(), local_positions_vec(&[2]))) + ); + assert_eq!( + mutable_segment.resolve_change_id_prefix(&HexPrefix::new(&id_5.hex()).unwrap()), + PrefixResolution::SingleMatch((id_5.clone(), local_positions_vec(&[4]))) + ); + + // Local lookup with locally unknown prefix + assert_eq!( + initial_file.resolve_change_id_prefix(&HexPrefix::new("0555").unwrap()), + PrefixResolution::NoMatch + ); + assert_eq!( + mutable_segment.resolve_change_id_prefix(&HexPrefix::new("000").unwrap()), + PrefixResolution::NoMatch + ); + + // Local lookup with locally unique prefix + assert_eq!( + initial_file.resolve_change_id_prefix(&HexPrefix::new("0554").unwrap()), + PrefixResolution::SingleMatch((id_2.clone(), local_positions_vec(&[2, 4, 5]))) + ); + assert_eq!( + mutable_segment.resolve_change_id_prefix(&HexPrefix::new("0554").unwrap()), + PrefixResolution::SingleMatch((id_3.clone(), local_positions_vec(&[0, 1]))) + ); + + // Local lookup with locally ambiguous prefix + assert_eq!( + initial_file.resolve_change_id_prefix(&HexPrefix::new("00").unwrap()), + PrefixResolution::AmbiguousMatch + ); + assert_eq!( + mutable_segment.resolve_change_id_prefix(&HexPrefix::new("05555").unwrap()), + PrefixResolution::AmbiguousMatch + ); + } + + #[test] + fn neighbor_change_ids() { + let temp_dir = testutils::new_temp_dir(); + let mut new_commit_id = commit_id_generator(); + + let id_0 = ChangeId::from_hex("00000001"); + let id_1 = ChangeId::from_hex("00999999"); + let id_2 = ChangeId::from_hex("05548888"); + let id_3 = ChangeId::from_hex("05544444"); + let id_4 = ChangeId::from_hex("05555555"); + let id_5 = ChangeId::from_hex("05555333"); + + // Create some commits with different various common prefixes. + let mut mutable_segment = MutableIndexSegment::full(16, 4); + mutable_segment.add_commit_data(new_commit_id(), id_0.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + + // Write these commits to one file and build the remainder on top. + let initial_file = mutable_segment.save_in(temp_dir.path()).unwrap(); + mutable_segment = MutableIndexSegment::incremental(initial_file.clone()); + + mutable_segment.add_commit_data(new_commit_id(), id_3.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_3.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_4.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_5.clone(), &[]); + + // Local lookup in readonly index, change_id exists. + assert_eq!( + initial_file.resolve_neighbor_change_ids(&id_0), + (None, Some(id_1.clone())), + ); + assert_eq!( + initial_file.resolve_neighbor_change_ids(&id_1), + (Some(id_0.clone()), Some(id_2.clone())), + ); + assert_eq!( + initial_file.resolve_neighbor_change_ids(&id_2), + (Some(id_1.clone()), None), + ); + + // Local lookup in readonly index, change_id does not exist. + assert_eq!( + initial_file.resolve_neighbor_change_ids(&ChangeId::from_hex("00000000")), + (None, Some(id_0.clone())), + ); + assert_eq!( + initial_file.resolve_neighbor_change_ids(&ChangeId::from_hex("00000002")), + (Some(id_0.clone()), Some(id_1.clone())), + ); + assert_eq!( + initial_file.resolve_neighbor_change_ids(&ChangeId::from_hex("ffffffff")), + (Some(id_2.clone()), None), + ); + + // Local lookup in mutable index, change_id exists. + // id_1 < id_3 < id_5 < id_4 + assert_eq!( + mutable_segment.resolve_neighbor_change_ids(&id_1), + (None, Some(id_3.clone())), + ); + assert_eq!( + mutable_segment.resolve_neighbor_change_ids(&id_3), + (Some(id_1.clone()), Some(id_5.clone())), + ); + assert_eq!( + mutable_segment.resolve_neighbor_change_ids(&id_5), + (Some(id_3.clone()), Some(id_4.clone())), + ); + assert_eq!( + mutable_segment.resolve_neighbor_change_ids(&id_4), + (Some(id_5.clone()), None), + ); + + // Local lookup in mutable index, change_id does not exist. + // id_1 < id_3 < id_5 < id_4 + assert_eq!( + mutable_segment.resolve_neighbor_change_ids(&ChangeId::from_hex("00999998")), + (None, Some(id_1.clone())), + ); + assert_eq!( + mutable_segment.resolve_neighbor_change_ids(&ChangeId::from_hex("01000000")), + (Some(id_1.clone()), Some(id_3.clone())), + ); + assert_eq!( + mutable_segment.resolve_neighbor_change_ids(&ChangeId::from_hex("05555332")), + (Some(id_3.clone()), Some(id_5.clone())), + ); + assert_eq!( + mutable_segment.resolve_neighbor_change_ids(&ChangeId::from_hex("ffffffff")), + (Some(id_4.clone()), None), + ); + } + #[test] fn test_is_ancestor() { let mut new_change_id = change_id_generator(); diff --git a/lib/src/default_index/mutable.rs b/lib/src/default_index/mutable.rs index 3153feb254..d979af78d8 100644 --- a/lib/src/default_index/mutable.rs +++ b/lib/src/default_index/mutable.rs @@ -391,7 +391,24 @@ impl IndexSegment for MutableIndexSegment { fn resolve_commit_id_prefix(&self, prefix: &HexPrefix) -> PrefixResolution { let min_bytes_prefix = CommitId::from_bytes(prefix.min_prefix_bytes()); - resolve_id_prefix(&self.commit_lookup, prefix, &min_bytes_prefix).map(|id| id.clone()) + resolve_id_prefix(&self.commit_lookup, prefix, &min_bytes_prefix).map(|(id, _)| id.clone()) + } + + fn resolve_neighbor_change_ids( + &self, + change_id: &ChangeId, + ) -> (Option, Option) { + let (prev_id, next_id) = resolve_neighbor_ids(&self.change_lookup, change_id); + (prev_id.cloned(), next_id.cloned()) + } + + fn resolve_change_id_prefix( + &self, + prefix: &HexPrefix, + ) -> PrefixResolution<(ChangeId, SmallLocalPositionsVec)> { + let min_bytes_prefix = ChangeId::from_bytes(prefix.min_prefix_bytes()); + resolve_id_prefix(&self.change_lookup, prefix, &min_bytes_prefix) + .map(|(id, positions)| (id.clone(), positions.clone())) } fn generation_number(&self, local_pos: LocalPosition) -> u32 { @@ -551,14 +568,13 @@ fn resolve_id_prefix<'a, K: ObjectId + Ord, V>( lookup_table: &'a BTreeMap, prefix: &HexPrefix, min_bytes_prefix: &K, -) -> PrefixResolution<&'a K> { +) -> PrefixResolution<(&'a K, &'a V)> { let mut matches = lookup_table .range((Bound::Included(min_bytes_prefix), Bound::Unbounded)) - .map(|(id, _pos)| id) - .take_while(|&id| prefix.matches(id)) + .take_while(|&(id, _)| prefix.matches(id)) .fuse(); match (matches.next(), matches.next()) { - (Some(id), None) => PrefixResolution::SingleMatch(id), + (Some(entry), None) => PrefixResolution::SingleMatch(entry), (Some(_), Some(_)) => PrefixResolution::AmbiguousMatch, (None, _) => PrefixResolution::NoMatch, } diff --git a/lib/src/default_index/readonly.rs b/lib/src/default_index/readonly.rs index 656578d71f..688729ff15 100644 --- a/lib/src/default_index/readonly.rs +++ b/lib/src/default_index/readonly.rs @@ -27,7 +27,7 @@ use smallvec::smallvec; use thiserror::Error; use super::composite::{AsCompositeIndex, ChangeIdIndexImpl, CompositeIndex, IndexSegment}; -use super::entry::{IndexPosition, LocalPosition, SmallIndexPositionsVec}; +use super::entry::{IndexPosition, LocalPosition, SmallIndexPositionsVec, SmallLocalPositionsVec}; use super::mutable::DefaultMutableIndex; use crate::backend::{ChangeId, CommitId}; use crate::index::{AllHeadsForGcUnsupported, ChangeIdIndex, Index, MutableIndex, ReadonlyIndex}; @@ -92,6 +92,20 @@ impl ParentIndexPosition { } } +/// Local position of entry pointed by change id, or overflow pointer. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +struct ChangeLocalPosition(u32); + +impl ChangeLocalPosition { + fn as_inlined(self) -> Option { + (self.0 & OVERFLOW_FLAG == 0).then_some(LocalPosition(self.0)) + } + + fn as_overflow(self) -> Option { + (self.0 & OVERFLOW_FLAG != 0).then_some(!self.0) + } +} + struct CommitGraphEntry<'a> { data: &'a [u8], } @@ -206,6 +220,7 @@ pub(super) struct ReadonlyIndexSegment { num_local_commits: u32, num_local_change_ids: u32, num_parent_overflow_entries: u32, + num_change_overflow_entries: u32, data: Vec, } @@ -334,6 +349,7 @@ impl ReadonlyIndexSegment { num_local_commits, num_local_change_ids, num_parent_overflow_entries, + num_change_overflow_entries, data, })) } @@ -385,6 +401,17 @@ impl ReadonlyIndexSegment { &self.data[offset..][..self.change_id_length] } + fn change_lookup_pos(&self, lookup_pos: u32) -> ChangeLocalPosition { + assert!(lookup_pos < self.num_local_change_ids); + let offset = (lookup_pos as usize) * 4 + + (self.num_local_commits as usize) * self.commit_graph_entry_size + + (self.num_local_commits as usize) * self.commit_lookup_entry_size + + (self.num_local_change_ids as usize) * self.change_id_length; + ChangeLocalPosition(u32::from_le_bytes( + self.data[offset..][..4].try_into().unwrap(), + )) + } + fn overflow_parents(&self, overflow_pos: u32, num_parents: u32) -> SmallIndexPositionsVec { assert!(overflow_pos + num_parents <= self.num_parent_overflow_entries); let offset = (overflow_pos as usize) * 4 @@ -398,6 +425,20 @@ impl ReadonlyIndexSegment { .collect() } + /// Scans graph entry positions stored in the overflow change ids table. + fn overflow_changes_from(&self, overflow_pos: u32) -> impl Iterator + '_ { + let base = (self.num_local_commits as usize) * self.commit_graph_entry_size + + (self.num_local_commits as usize) * self.commit_lookup_entry_size + + (self.num_local_change_ids as usize) * self.change_id_length + + (self.num_local_change_ids as usize) * 4 + + (self.num_parent_overflow_entries as usize) * 4; + let size = (self.num_change_overflow_entries as usize) * 4; + let offset = (overflow_pos as usize) * 4; + self.data[base..][..size][offset..] + .chunks_exact(4) + .map(|chunk| LocalPosition(u32::from_le_bytes(chunk.try_into().unwrap()))) + } + /// Binary searches commit id by `prefix`. Returns the lookup position. fn commit_id_byte_prefix_to_lookup_pos(&self, prefix: &[u8]) -> PositionLookupResult { binary_search_pos_by(self.num_local_commits, |pos| { @@ -405,6 +446,14 @@ impl ReadonlyIndexSegment { entry.commit_id_bytes().cmp(prefix) }) } + + /// Binary searches change id by `prefix`. Returns the lookup position. + fn change_id_byte_prefix_to_lookup_pos(&self, prefix: &[u8]) -> PositionLookupResult { + binary_search_pos_by(self.num_local_change_ids, |pos| { + let change_id_bytes = self.change_lookup_id_bytes(pos); + change_id_bytes.cmp(prefix) + }) + } } impl IndexSegment for ReadonlyIndexSegment { @@ -443,6 +492,49 @@ impl IndexSegment for ReadonlyIndexSegment { fn resolve_commit_id_prefix(&self, prefix: &HexPrefix) -> PrefixResolution { self.commit_id_byte_prefix_to_lookup_pos(prefix.min_prefix_bytes()) .prefix_matches(prefix, |pos| self.commit_lookup_entry(pos).commit_id()) + .map(|(id, _)| id) + } + + fn resolve_neighbor_change_ids( + &self, + change_id: &ChangeId, + ) -> (Option, Option) { + self.change_id_byte_prefix_to_lookup_pos(change_id.as_bytes()) + .map_neighbors(|pos| self.change_lookup_id(pos)) + } + + fn resolve_change_id_prefix( + &self, + prefix: &HexPrefix, + ) -> PrefixResolution<(ChangeId, SmallLocalPositionsVec)> { + self.change_id_byte_prefix_to_lookup_pos(prefix.min_prefix_bytes()) + .prefix_matches(prefix, |pos| self.change_lookup_id(pos)) + .map(|(id, lookup_pos)| { + let change_pos = self.change_lookup_pos(lookup_pos); + if let Some(local_pos) = change_pos.as_inlined() { + (id, smallvec![local_pos]) + } else { + let overflow_pos = change_pos.as_overflow().unwrap(); + // Collect commits having the same change id. For cache + // locality, it might be better to look for the next few + // change id positions to determine the size. + let positions: SmallLocalPositionsVec = self + .overflow_changes_from(overflow_pos) + .take_while(|&local_pos| { + let entry = self.graph_entry(local_pos); + entry.change_id_lookup_pos() == lookup_pos + }) + .collect(); + debug_assert_eq!( + overflow_pos + u32::try_from(positions.len()).unwrap(), + (lookup_pos + 1..self.num_local_change_ids) + .find_map(|lookup_pos| self.change_lookup_pos(lookup_pos).as_overflow()) + .unwrap_or(self.num_change_overflow_entries), + "all overflow positions to the next change id should be collected" + ); + (id, positions) + } + }) } fn generation_number(&self, local_pos: LocalPosition) -> u32 { @@ -609,14 +701,14 @@ impl PositionLookupResult { self, prefix: &HexPrefix, lookup: impl FnMut(u32) -> T, - ) -> PrefixResolution { + ) -> PrefixResolution<(T, u32)> { let lookup_pos = self.result.unwrap_or_else(|pos| pos); let mut matches = (lookup_pos..self.size) .map(lookup) .take_while(|id| prefix.matches(id)) .fuse(); match (matches.next(), matches.next()) { - (Some(id), None) => PrefixResolution::SingleMatch(id), + (Some(id), None) => PrefixResolution::SingleMatch((id, lookup_pos)), (Some(_), Some(_)) => PrefixResolution::AmbiguousMatch, (None, _) => PrefixResolution::NoMatch, } From 4db25e6e4575cb7033fa9a7dc17551e3695d8121 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Fri, 22 Dec 2023 16:57:21 +0900 Subject: [PATCH 4/5] index: implement index-level change id lookup methods These methods are basically the same as the commit_id versions, but resolve_change_id_prefix() is a bit more involved as we need to gather matches from multiple segments. --- lib/src/default_index/composite.rs | 67 +++++++++++ lib/src/default_index/mod.rs | 181 +++++++++++++++++++++++++++++ 2 files changed, 248 insertions(+) diff --git a/lib/src/default_index/composite.rs b/lib/src/default_index/composite.rs index 79f2aa6fcc..3e8eac08b9 100644 --- a/lib/src/default_index/composite.rs +++ b/lib/src/default_index/composite.rs @@ -200,6 +200,73 @@ impl<'a> CompositeIndex<'a> { .unwrap() } + /// Suppose the given `change_id` exists, returns the minimum prefix length + /// to disambiguate it within all the indexed ids including hidden ones. + #[cfg(test)] // TODO + pub(super) fn shortest_unique_change_id_prefix_len(&self, change_id: &ChangeId) -> usize { + let (prev_id, next_id) = self.resolve_neighbor_change_ids(change_id); + itertools::chain(prev_id, next_id) + .map(|id| hex_util::common_hex_len(change_id.as_bytes(), id.as_bytes()) + 1) + .max() + .unwrap_or(0) + } + + /// Suppose the given `change_id` exists, returns the previous and next + /// change ids in lexicographical order. The returned change ids may be + /// hidden. + #[cfg(test)] // TODO + pub(super) fn resolve_neighbor_change_ids( + &self, + change_id: &ChangeId, + ) -> (Option, Option) { + self.ancestor_index_segments() + .map(|segment| segment.resolve_neighbor_change_ids(change_id)) + .reduce(|(acc_prev_id, acc_next_id), (prev_id, next_id)| { + ( + acc_prev_id.into_iter().chain(prev_id).max(), + acc_next_id.into_iter().chain(next_id).min(), + ) + }) + .unwrap() + } + + /// Resolves the given change id `prefix` to the associated entries. The + /// returned entries may be hidden. + /// + /// The returned index positions are sorted in ascending order. + #[cfg(test)] // TODO + pub(super) fn resolve_change_id_prefix( + &self, + prefix: &HexPrefix, + ) -> PrefixResolution<(ChangeId, SmallIndexPositionsVec)> { + use PrefixResolution::*; + self.ancestor_index_segments() + .fold(NoMatch, |acc_match, segment| { + if acc_match == AmbiguousMatch { + return acc_match; // avoid checking the parent file(s) + } + let to_global_pos = { + let num_parent_commits = segment.num_parent_commits(); + move |LocalPosition(pos)| IndexPosition(pos + num_parent_commits) + }; + // Similar to PrefixResolution::plus(), but merges matches of the same id. + match (acc_match, segment.resolve_change_id_prefix(prefix)) { + (NoMatch, local_match) => local_match.map(|(id, positions)| { + (id, positions.into_iter().map(to_global_pos).collect()) + }), + (acc_match, NoMatch) => acc_match, + (AmbiguousMatch, _) => AmbiguousMatch, + (_, AmbiguousMatch) => AmbiguousMatch, + (SingleMatch((id1, _)), SingleMatch((id2, _))) if id1 != id2 => AmbiguousMatch, + (SingleMatch((id, mut acc_positions)), SingleMatch((_, local_positions))) => { + acc_positions + .insert_many(0, local_positions.into_iter().map(to_global_pos)); + SingleMatch((id, acc_positions)) + } + } + }) + } + pub(super) fn is_ancestor_pos( &self, ancestor_pos: IndexPosition, diff --git a/lib/src/default_index/mod.rs b/lib/src/default_index/mod.rs index a11c710cf0..b559794943 100644 --- a/lib/src/default_index/mod.rs +++ b/lib/src/default_index/mod.rs @@ -595,6 +595,9 @@ mod tests { let local_positions_vec = |positions: &[u32]| -> SmallLocalPositionsVec { positions.iter().copied().map(LocalPosition).collect() }; + let index_positions_vec = |positions: &[u32]| -> SmallIndexPositionsVec { + positions.iter().copied().map(IndexPosition).collect() + }; let id_0 = ChangeId::from_hex("00000001"); let id_1 = ChangeId::from_hex("00999999"); @@ -683,6 +686,76 @@ mod tests { mutable_segment.resolve_change_id_prefix(&HexPrefix::new("05555").unwrap()), PrefixResolution::AmbiguousMatch ); + + let index = mutable_segment.as_composite(); + + // Global lookup with the full hex digits + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new(&id_0.hex()).unwrap()), + PrefixResolution::SingleMatch((id_0.clone(), index_positions_vec(&[0]))) + ); + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new(&id_1.hex()).unwrap()), + PrefixResolution::SingleMatch((id_1.clone(), index_positions_vec(&[1, 3, 9]))) + ); + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new(&id_2.hex()).unwrap()), + PrefixResolution::SingleMatch((id_2.clone(), index_positions_vec(&[2, 4, 5]))) + ); + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new(&id_3.hex()).unwrap()), + PrefixResolution::SingleMatch((id_3.clone(), index_positions_vec(&[6, 7]))) + ); + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new(&id_4.hex()).unwrap()), + PrefixResolution::SingleMatch((id_4.clone(), index_positions_vec(&[8]))) + ); + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new(&id_5.hex()).unwrap()), + PrefixResolution::SingleMatch((id_5.clone(), index_positions_vec(&[10]))) + ); + + // Global lookup with unknown prefix + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new("ffffffff").unwrap()), + PrefixResolution::NoMatch + ); + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new("00000002").unwrap()), + PrefixResolution::NoMatch + ); + + // Global lookup with globally unique prefix + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new("000").unwrap()), + PrefixResolution::SingleMatch((id_0.clone(), index_positions_vec(&[0]))) + ); + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new("055553").unwrap()), + PrefixResolution::SingleMatch((id_5.clone(), index_positions_vec(&[10]))) + ); + + // Global lookup with globally unique prefix stored in both parts + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new("009").unwrap()), + PrefixResolution::SingleMatch((id_1.clone(), index_positions_vec(&[1, 3, 9]))) + ); + + // Global lookup with locally ambiguous prefix + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new("00").unwrap()), + PrefixResolution::AmbiguousMatch + ); + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new("05555").unwrap()), + PrefixResolution::AmbiguousMatch + ); + + // Global lookup with locally unique but globally ambiguous prefix + assert_eq!( + index.resolve_change_id_prefix(&HexPrefix::new("0554").unwrap()), + PrefixResolution::AmbiguousMatch + ); } #[test] @@ -781,6 +854,114 @@ mod tests { mutable_segment.resolve_neighbor_change_ids(&ChangeId::from_hex("ffffffff")), (Some(id_4.clone()), None), ); + + let index = mutable_segment.as_composite(); + + // Global lookup, change_id exists. + // id_0 < id_1 < id_3 < id_2 < id_5 < id_4 + assert_eq!( + index.resolve_neighbor_change_ids(&id_0), + (None, Some(id_1.clone())), + ); + assert_eq!( + index.resolve_neighbor_change_ids(&id_1), + (Some(id_0.clone()), Some(id_3.clone())), + ); + assert_eq!( + index.resolve_neighbor_change_ids(&id_3), + (Some(id_1.clone()), Some(id_2.clone())), + ); + assert_eq!( + index.resolve_neighbor_change_ids(&id_2), + (Some(id_3.clone()), Some(id_5.clone())), + ); + assert_eq!( + index.resolve_neighbor_change_ids(&id_5), + (Some(id_2.clone()), Some(id_4.clone())), + ); + assert_eq!( + index.resolve_neighbor_change_ids(&id_4), + (Some(id_5.clone()), None), + ); + + // Global lookup, change_id doesn't exist. + // id_0 < id_1 < id_3 < id_2 < id_5 < id_4 + assert_eq!( + index.resolve_neighbor_change_ids(&ChangeId::from_hex("00000000")), + (None, Some(id_0.clone())), + ); + assert_eq!( + index.resolve_neighbor_change_ids(&ChangeId::from_hex("01000000")), + (Some(id_1.clone()), Some(id_3.clone())), + ); + assert_eq!( + index.resolve_neighbor_change_ids(&ChangeId::from_hex("05544555")), + (Some(id_3.clone()), Some(id_2.clone())), + ); + assert_eq!( + index.resolve_neighbor_change_ids(&ChangeId::from_hex("ffffffff")), + (Some(id_4.clone()), None), + ); + } + + #[test] + fn shortest_unique_change_id_prefix() { + let temp_dir = testutils::new_temp_dir(); + let mut new_commit_id = commit_id_generator(); + + let id_0 = ChangeId::from_hex("00000001"); + let id_1 = ChangeId::from_hex("00999999"); + let id_2 = ChangeId::from_hex("05548888"); + let id_3 = ChangeId::from_hex("05544444"); + let id_4 = ChangeId::from_hex("05555555"); + let id_5 = ChangeId::from_hex("05555333"); + + // Create some commits with different various common prefixes. + let mut mutable_segment = MutableIndexSegment::full(16, 4); + mutable_segment.add_commit_data(new_commit_id(), id_0.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_2.clone(), &[]); + + // Write these commits to one file and build the remainder on top. + let initial_file = mutable_segment.save_in(temp_dir.path()).unwrap(); + mutable_segment = MutableIndexSegment::incremental(initial_file.clone()); + + mutable_segment.add_commit_data(new_commit_id(), id_3.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_3.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_4.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_1.clone(), &[]); + mutable_segment.add_commit_data(new_commit_id(), id_5.clone(), &[]); + + let index = mutable_segment.as_composite(); + + // Calculate shortest unique prefix len with known change_id + assert_eq!(index.shortest_unique_change_id_prefix_len(&id_0), 3); + assert_eq!(index.shortest_unique_change_id_prefix_len(&id_1), 3); + assert_eq!(index.shortest_unique_change_id_prefix_len(&id_2), 5); + assert_eq!(index.shortest_unique_change_id_prefix_len(&id_3), 5); + assert_eq!(index.shortest_unique_change_id_prefix_len(&id_4), 6); + assert_eq!(index.shortest_unique_change_id_prefix_len(&id_5), 6); + + // Calculate shortest unique prefix len with unknown change_id + assert_eq!( + index.shortest_unique_change_id_prefix_len(&ChangeId::from_hex("00000002")), + 8 + ); + assert_eq!( + index.shortest_unique_change_id_prefix_len(&ChangeId::from_hex("01000000")), + 2 + ); + assert_eq!( + index.shortest_unique_change_id_prefix_len(&ChangeId::from_hex("05555344")), + 7 + ); + assert_eq!( + index.shortest_unique_change_id_prefix_len(&ChangeId::from_hex("ffffffff")), + 1 + ); } #[test] From 417a072ff36edd5bde0cfa365f39c4d1fa0f26e1 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Fri, 22 Dec 2023 17:32:49 +0900 Subject: [PATCH 5/5] index: switch to persistent change id index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shortest change id prefix will become a few digits longer, but I think that's acceptable. Entries included in the "revsets.short-prefixes" set are unaffected. The reachable set is calculated eagerly, but this is still faster as we no longer need to sort the reachable entries by change id. The lazy version will save another ~100ms in mid-size repos. "jj log" without working copy snapshot: ``` % hyperfine --sort command --warmup 3 --runs 20 -L bin jj-0,jj-1,jj-2 \ -s "target/release-with-debug/{bin} -R ~/mirrors/linux debug reindex" \ "target/release-with-debug/{bin} -R ~/mirrors/linux \ --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=\"\"'" Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 353.6 ms ± 11.9 ms [User: 266.7 ms, System: 87.0 ms] Range (min … max): 329.0 ms … 365.6 ms 20 runs Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 271.3 ms ± 9.9 ms [User: 183.8 ms, System: 87.7 ms] Range (min … max): 250.5 ms … 282.7 ms 20 runs Relative speed comparison 1.99 ± 0.16 target/release-with-debug/jj-0 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' 1.53 ± 0.12 target/release-with-debug/jj-1 -R ~/mirrors/linux --ignore-working-copy log -r.. -l100 --config-toml='revsets.short-prefixes=""' ``` "jj status" with working copy snapshot (watchman enabled): ``` % hyperfine --sort command --warmup 3 --runs 20 -L bin jj-0,jj-1,jj-2 \ -s "target/release-with-debug/{bin} -R ~/mirrors/linux debug reindex" \ "target/release-with-debug/{bin} -R ~/mirrors/linux \ status --config-toml='revsets.short-prefixes=\"\"'" Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/linux status --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 396.6 ms ± 10.1 ms [User: 300.7 ms, System: 94.0 ms] Range (min … max): 373.6 ms … 408.0 ms 20 runs Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/linux status --config-toml='revsets.short-prefixes=""' Time (mean ± σ): 318.6 ms ± 12.6 ms [User: 219.1 ms, System: 94.1 ms] Range (min … max): 294.2 ms … 333.0 ms 20 runs Relative speed comparison 1.85 ± 0.14 target/release-with-debug/jj-0 -R ~/mirrors/linux status --config-toml='revsets.short-prefixes=""' 1.48 ± 0.12 target/release-with-debug/jj-1 -R ~/mirrors/linux status --config-toml='revsets.short-prefixes=""' ``` --- cli/tests/test_log_command.rs | 8 ++-- lib/src/default_index/composite.rs | 75 +++++++++++++++++++----------- lib/src/default_index/readonly.rs | 1 - lib/tests/test_index.rs | 17 ++----- 4 files changed, 56 insertions(+), 45 deletions(-) diff --git a/cli/tests/test_log_command.rs b/cli/tests/test_log_command.rs index 6721d29efb..1dd598e9d3 100644 --- a/cli/tests/test_log_command.rs +++ b/cli/tests/test_log_command.rs @@ -606,10 +606,10 @@ fn test_log_prefix_highlight_counts_hidden_commits() { insta::assert_snapshot!( test_env.jj_cmd_success(&repo_path, &["log", "-T", prefix_format]), @r###" - @ Change w[qnwkozpkust] 44[4c3c5066d3] - │ ◉ Change q[pvuntsmwlqt] initial ba[1a30916d29] original + @ Change wq[nwkozpkust] 44[4c3c5066d3] + │ ◉ Change qpv[untsmwlqt] initial ba[1a30916d29] original ├─╯ - ◉ Change z[zzzzzzzzzzz] 00[0000000000] + ◉ Change zzz[zzzzzzzzz] 00[0000000000] "### ); insta::assert_snapshot!( @@ -621,7 +621,7 @@ fn test_log_prefix_highlight_counts_hidden_commits() { insta::assert_snapshot!( test_env.jj_cmd_success(&repo_path, &["log", "-r", "44", "-T", prefix_format]), @r###" - @ Change w[qnwkozpkust] 44[4c3c5066d3] + @ Change wq[nwkozpkust] 44[4c3c5066d3] │ ~ "### diff --git a/lib/src/default_index/composite.rs b/lib/src/default_index/composite.rs index 3e8eac08b9..4cc773ee3c 100644 --- a/lib/src/default_index/composite.rs +++ b/lib/src/default_index/composite.rs @@ -30,7 +30,6 @@ use super::rev_walk::RevWalk; use super::revset_engine; use crate::backend::{ChangeId, CommitId}; use crate::hex_util; -use crate::id_prefix::{IdIndex, IdIndexSource, IdIndexSourceEntry}; use crate::index::{AllHeadsForGcUnsupported, ChangeIdIndex, Index}; use crate::object_id::{HexPrefix, ObjectId, PrefixResolution}; use crate::revset::{ResolvedExpression, Revset, RevsetEvaluationError}; @@ -202,7 +201,6 @@ impl<'a> CompositeIndex<'a> { /// Suppose the given `change_id` exists, returns the minimum prefix length /// to disambiguate it within all the indexed ids including hidden ones. - #[cfg(test)] // TODO pub(super) fn shortest_unique_change_id_prefix_len(&self, change_id: &ChangeId) -> usize { let (prev_id, next_id) = self.resolve_neighbor_change_ids(change_id); itertools::chain(prev_id, next_id) @@ -214,7 +212,6 @@ impl<'a> CompositeIndex<'a> { /// Suppose the given `change_id` exists, returns the previous and next /// change ids in lexicographical order. The returned change ids may be /// hidden. - #[cfg(test)] // TODO pub(super) fn resolve_neighbor_change_ids( &self, change_id: &ChangeId, @@ -234,7 +231,6 @@ impl<'a> CompositeIndex<'a> { /// returned entries may be hidden. /// /// The returned index positions are sorted in ascending order. - #[cfg(test)] // TODO pub(super) fn resolve_change_id_prefix( &self, prefix: &HexPrefix, @@ -500,50 +496,73 @@ impl Index for CompositeIndex<'_> { pub(super) struct ChangeIdIndexImpl { index: I, - pos_by_change: IdIndex, + reachable_bitset: Vec, } impl ChangeIdIndexImpl { pub fn new(index: I, heads: &mut dyn Iterator) -> ChangeIdIndexImpl { - let mut pos_by_change = IdIndex::builder(); + // TODO: Calculate reachable bitset lazily. let composite = index.as_composite(); + let bitset_len = + usize::try_from(u32::div_ceil(composite.num_commits(), u64::BITS)).unwrap(); + let mut reachable_bitset = vec![0; bitset_len]; // request zeroed page let head_positions = heads .map(|id| composite.commit_id_to_pos(id).unwrap()) .collect_vec(); for entry in composite.walk_revs(&head_positions, &[]) { - pos_by_change.insert(&entry.change_id(), entry.position()); + let IndexPosition(pos) = entry.position(); + let bitset_pos = pos / u64::BITS; + let bit = 1_u64 << (pos % u64::BITS); + reachable_bitset[usize::try_from(bitset_pos).unwrap()] |= bit; } - Self { + ChangeIdIndexImpl { index, - pos_by_change: pos_by_change.build(), + reachable_bitset, } } } impl ChangeIdIndex for ChangeIdIndexImpl { + /// Resolves change id prefix among all ids, then filters out hidden + /// entries. + /// + /// If `SingleMatch` is returned, the commits including in the set are all + /// visible. `AmbiguousMatch` may be returned even if the prefix is unique + /// within the visible entries. fn resolve_prefix(&self, prefix: &HexPrefix) -> PrefixResolution> { - self.pos_by_change - .resolve_prefix_with(self.index.as_composite(), prefix, |entry| entry.commit_id()) - .map(|(_, commit_ids)| commit_ids) + let index = self.index.as_composite(); + match index.resolve_change_id_prefix(prefix) { + PrefixResolution::NoMatch => PrefixResolution::NoMatch, + PrefixResolution::SingleMatch((_change_id, positions)) => { + let reachable_commit_ids = positions + .iter() + .filter(|IndexPosition(pos)| { + let bitset_pos = pos / u64::BITS; + let bit = 1_u64 << (pos % u64::BITS); + let bits = self.reachable_bitset[usize::try_from(bitset_pos).unwrap()]; + bits & bit != 0 + }) + .map(|&pos| index.entry_by_pos(pos).commit_id()) + .collect_vec(); + if reachable_commit_ids.is_empty() { + PrefixResolution::NoMatch + } else { + PrefixResolution::SingleMatch(reachable_commit_ids) + } + } + PrefixResolution::AmbiguousMatch => PrefixResolution::AmbiguousMatch, + } } + /// Calculates the shortest prefix length of the given `change_id` among all + /// ids including hidden entries. + /// + /// The returned length is usually a few digits longer than the minimum + /// length to disambiguate within the visible entries. fn shortest_unique_prefix_len(&self, change_id: &ChangeId) -> usize { - self.pos_by_change - .shortest_unique_prefix_len(self.index.as_composite(), change_id) - } -} - -impl<'index> IdIndexSource for CompositeIndex<'index> { - type Entry = IndexEntry<'index>; - - fn entry_at(&self, pointer: &IndexPosition) -> Self::Entry { - self.entry_by_pos(*pointer) - } -} - -impl IdIndexSourceEntry for IndexEntry<'_> { - fn to_key(&self) -> ChangeId { - self.change_id() + self.index + .as_composite() + .shortest_unique_change_id_prefix_len(change_id) } } diff --git a/lib/src/default_index/readonly.rs b/lib/src/default_index/readonly.rs index 688729ff15..7b7157c4ae 100644 --- a/lib/src/default_index/readonly.rs +++ b/lib/src/default_index/readonly.rs @@ -652,7 +652,6 @@ impl ReadonlyIndex for DefaultReadonlyIndex { self } - // TODO: Create a persistent lookup from change id to commit ids. fn change_id_index( &self, heads: &mut dyn Iterator, diff --git a/lib/tests/test_index.rs b/lib/tests/test_index.rs index f08b420959..16a7f9ce39 100644 --- a/lib/tests/test_index.rs +++ b/lib/tests/test_index.rs @@ -821,14 +821,10 @@ fn test_change_id_index() { // No match assert_eq!(resolve_prefix("ba"), PrefixResolution::NoMatch); - // Test with a revset containing only some of the commits. We should get shorter - // prefixes and be able to resolve shorter prefixes. + // Test with an index containing only some of the commits. The shortest + // length doesn't have to be minimized further, but unreachable commits + // should never be included in the resolved set. let change_id_index = index_for_heads(&[&commit_1, &commit_2]); - let prefix_len = - |commit: &Commit| change_id_index.shortest_unique_prefix_len(commit.change_id()); - assert_eq!(prefix_len(&commit_1), 2); - assert_eq!(prefix_len(&commit_2), 2); - assert_eq!(prefix_len(&commit_3), 6); let resolve_prefix = |prefix: &str| { change_id_index .resolve_prefix(&HexPrefix::new(prefix).unwrap()) @@ -839,13 +835,10 @@ fn test_change_id_index() { PrefixResolution::SingleMatch(hashset! {root_commit.id().clone()}) ); assert_eq!( - resolve_prefix("aa"), + resolve_prefix("aaaaab"), PrefixResolution::SingleMatch(hashset! {commit_2.id().clone()}) ); - assert_eq!( - resolve_prefix("ab"), - PrefixResolution::SingleMatch(hashset! {commit_1.id().clone()}) - ); + assert_eq!(resolve_prefix("aaaaaa"), PrefixResolution::NoMatch); assert_eq!(resolve_prefix("a"), PrefixResolution::AmbiguousMatch); assert_eq!(resolve_prefix("b"), PrefixResolution::NoMatch); }