diff --git a/src/docset.rs b/src/docset.rs index 2242c81ee0..e8799b2e9d 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -40,6 +40,8 @@ pub trait DocSet: Send { /// of `DocSet` should support it. /// /// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`. + /// + /// `target` has to be larger or equal to `.doc()` when calling `seek`. fn seek(&mut self, target: DocId) -> DocId { let mut doc = self.doc(); debug_assert!(doc <= target); @@ -58,11 +60,22 @@ pub trait DocSet: Send { /// /// ## API Behaviour /// If `seek_exact` is returning true, a call to `doc()` has to return target. - /// If `seek_exact` is returning false, a call to `doc()` may return the previous doc, - /// which may be lower than target. + /// If `seek_exact` is returning false, a call to `doc()` may return any doc and should not be + /// used until `seek_exact` returns true again. The DocSet is considered to be in an invalid + /// state until `seek_exact` returns true again. + /// + /// target needs to be equal or larger than `doc` when in a valid state. + /// + /// Consecutive calls are not allowed to have decreasing `target` values. + /// + /// # Warning + /// This is an advanced API used by intersection. The API contract is tricky, avoid using it. fn seek_exact(&mut self, target: DocId) -> bool { - let doc = self.seek(target); - doc == target + let current_doc = self.doc(); + if current_doc < target { + self.seek(target); + } + self.doc() == target } /// Fills a given mutable buffer with the next doc ids from the @@ -103,8 +116,11 @@ pub trait DocSet: Send { /// length of the docset. fn size_hint(&self) -> u32; - /// Returns a best-effort hint of the - /// cost to drive the docset. + /// Returns a best-effort hint of the cost to consume the entire docset. + /// + /// Consuming means calling advance until [`TERMINATED`] is returned. + /// The cost should be relative to the cost of driving a Term query, + /// which would be the number of documents in the DocSet. 
/// /// By default this returns `size_hint()`. /// diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 01a435b43b..7eac79184b 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -12,7 +12,13 @@ use crate::{DocId, Score}; /// For better performance, the function uses a /// specialized implementation if the two /// shortest scorers are `TermScorer`s. -pub fn intersect_scorers(mut scorers: Vec>, num_docs: u32) -> Box { +/// +/// num_docs_segment is the number of documents in the segment. It is used for estimating the +/// `size_hint` of the intersection. +pub fn intersect_scorers( + mut scorers: Vec>, + num_docs_segment: u32, +) -> Box { if scorers.is_empty() { return Box::new(EmptyScorer); } @@ -35,14 +41,14 @@ pub fn intersect_scorers(mut scorers: Vec>, num_docs: u32) -> Bo left: *(left.downcast::().map_err(|_| ()).unwrap()), right: *(right.downcast::().map_err(|_| ()).unwrap()), others: scorers, - num_docs, + num_docs: num_docs_segment, }); } Box::new(Intersection { left, right, others: scorers, - num_docs, + num_docs: num_docs_segment, }) } diff --git a/src/query/range_query/fast_field_range_doc_set.rs b/src/query/range_query/fast_field_range_doc_set.rs index b9bc25d6f8..7036095050 100644 --- a/src/query/range_query/fast_field_range_doc_set.rs +++ b/src/query/range_query/fast_field_range_doc_set.rs @@ -183,7 +183,14 @@ impl DocSet for RangeDocSe fn cost(&self) -> u64 { // Advancing the docset is pretty expensive since it scans the whole column, there is no // index currently (will change with an kd-tree) - // Since we use SIMD to scan the fast field range query we lower the cost a little bit. + // Since we use SIMD to scan the fast field range query we lower the cost a little bit, + // assuming that we hit 10% of the docs like in size_hint. + // + // If we would return a cost higher than num_docs, we would never choose ff range query as + // the driver in a DocSet, when intersecting a term query with a fast field. 
But + // it's the faster choice when the term query has a lot of docids and the range + // query does not. + // // Ideally this would take the fast field codec into account (self.column.num_docs() as f64 * 0.8) as u64 } diff --git a/src/query/size_hint.rs b/src/query/size_hint.rs index 310b02cc1d..3d2811d40a 100644 --- a/src/query/size_hint.rs +++ b/src/query/size_hint.rs @@ -10,7 +10,9 @@ /// The estimated number of documents in the intersection. pub fn estimate_intersection(mut docset_sizes: I, max_docs: u32) -> u32 where I: Iterator { - if max_doc == 0u32 { return 0u32; } + if max_docs == 0u32 { + return 0u32; + } // Terms tend to be not really randomly distributed. // This factor is used to adjust the estimate. let mut co_loc_factor: f64 = 1.3;