Skip to content

Commit

Permalink
add API contract verification
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz committed Nov 13, 2024
1 parent 32b0d86 commit ec6f5b3
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 11 deletions.
28 changes: 22 additions & 6 deletions src/docset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ pub trait DocSet: Send {
/// of `DocSet` should support it.
///
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
///
/// `target` has to be greater than or equal to `.doc()` when calling `seek`.
fn seek(&mut self, target: DocId) -> DocId {
let mut doc = self.doc();
debug_assert!(doc <= target);
Expand All @@ -58,11 +60,22 @@ pub trait DocSet: Send {
///
/// ## API Behaviour
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
/// If `seek_exact` is returning false, a call to `doc()` may return the previous doc,
/// which may be lower than target.
/// If `seek_exact` returns false, `doc()` may return any doc, and its value should not be
/// used until `seek_exact` returns true again. The DocSet is considered to be in an invalid
/// state until `seek_exact` returns true again.
///
/// `target` needs to be greater than or equal to `doc` when in a valid state.
///
/// Consecutive calls are not allowed to have decreasing `target` values.
///
/// # Warning
/// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
fn seek_exact(&mut self, target: DocId) -> bool {
let doc = self.seek(target);
doc == target
let current_doc = self.doc();
if current_doc < target {
self.seek(target);
}
self.doc() == target
}

/// Fills a given mutable buffer with the next doc ids from the
Expand Down Expand Up @@ -103,8 +116,11 @@ pub trait DocSet: Send {
/// length of the docset.
fn size_hint(&self) -> u32;

/// Returns a best-effort hint of the
/// cost to drive the docset.
/// Returns a best-effort hint of the cost to consume the entire docset.
///
/// Consuming means calling advance until [`TERMINATED`] is returned.
/// The cost should be relative to the cost of driving a Term query,
/// which would be the number of documents in the DocSet.
///
/// By default this returns `size_hint()`.
///
Expand Down
12 changes: 9 additions & 3 deletions src/query/intersection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,13 @@ use crate::{DocId, Score};
/// For better performance, the function uses a
/// specialized implementation if the two
/// shortest scorers are `TermScorer`s.
pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>, num_docs: u32) -> Box<dyn Scorer> {
///
/// `num_docs_segment` is the number of documents in the segment. It is used for estimating the
/// `size_hint` of the intersection.
pub fn intersect_scorers(
mut scorers: Vec<Box<dyn Scorer>>,
num_docs_segment: u32,
) -> Box<dyn Scorer> {
if scorers.is_empty() {
return Box::new(EmptyScorer);
}
Expand All @@ -35,14 +41,14 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>, num_docs: u32) -> Bo
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
others: scorers,
num_docs,
num_docs: num_docs_segment,
});
}
Box::new(Intersection {
left,
right,
others: scorers,
num_docs,
num_docs: num_docs_segment,
})
}

Expand Down
9 changes: 8 additions & 1 deletion src/query/range_query/fast_field_range_doc_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,14 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
fn cost(&self) -> u64 {
// Advancing the docset is pretty expensive since it scans the whole column, there is no
// index currently (will change with a k-d tree)
// Since we use SIMD to scan the fast field range query we lower the cost a little bit.
// Since we use SIMD to scan the fast field range query we lower the cost a little bit,
// assuming that we hit 10% of the docs like in size_hint.
//
// If we returned a cost higher than num_docs, the ff range query would never be chosen as
// the driver when intersecting a term query with a fast field. But
// it's the faster choice when the term query has a lot of docids and the range
// query does not.
//
// Ideally this would take the fast field codec into account
(self.column.num_docs() as f64 * 0.8) as u64
}
Expand Down
4 changes: 3 additions & 1 deletion src/query/size_hint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
/// The estimated number of documents in the intersection.
pub fn estimate_intersection<I>(mut docset_sizes: I, max_docs: u32) -> u32
where I: Iterator<Item = u32> {
if max_doc == 0u32 { return 0u32; }
if max_docs == 0u32 {
return 0u32;
}
// Terms tend to be not really randomly distributed.
// This factor is used to adjust the estimate.
let mut co_loc_factor: f64 = 1.3;
Expand Down

0 comments on commit ec6f5b3

Please sign in to comment.