Fixes and docs updates

pisa-engine · May 18, 2024 · 6178ca5 · 6178ca5
1 parent 686541f
commit 6178ca5
Show file tree

Hide file tree

Showing 29 changed files with 222 additions and 186 deletions.
diff --git a/include/pisa/codec/block_codec.hpp b/include/pisa/codec/block_codec.hpp
@@ -8,7 +8,7 @@
 namespace pisa {
 
 /**
- * Block codecs encode and decode an entire list. This is in opposition to a streaming codec,
+ * Block codecs encode and decode a list of integers. This is in contrast to a streaming codec,
  * which can encode and decode values one by one.
  */
 class BlockCodec {
@@ -32,10 +32,13 @@ class BlockCodec {
      * Returns the block size of the encoding.
      *
      * Block codecs write blocks of fixed size, e.g., 128 integers. Thus, it is only possible to
-     * encode at most `block_size()` elements.
+     * encode at most `block_size()` elements with a single `encode` call.
      */
     [[nodiscard]] virtual auto block_size() const noexcept -> std::size_t = 0;
 
+    /**
+     * Returns the name of the codec.
+     */
     [[nodiscard]] virtual auto get_name() const noexcept -> std::string_view = 0;
 };
 

diff --git a/include/pisa/codec/block_codec_registry.hpp b/include/pisa/codec/block_codec_registry.hpp
@@ -7,16 +7,6 @@
 #include <gsl/span>
 
 #include "codec/block_codec.hpp"
-#include "codec/interpolative.hpp"
-#include "codec/maskedvbyte.hpp"
-#include "codec/optpfor.hpp"
-#include "codec/qmx.hpp"
-#include "codec/simdbp.hpp"
-#include "codec/simple16.hpp"
-#include "codec/simple8b.hpp"
-#include "codec/streamvbyte.hpp"
-#include "codec/varint_g8iu.hpp"
-#include "codec/varintgb.hpp"
 
 namespace pisa {
 
@@ -44,24 +34,16 @@ struct BlockCodecRegistry {
     }
 };
 
-using BlockCodecs = BlockCodecRegistry<
-    InterpolativeBlockCodec,
-    MaskedVByteBlockCodec,
-    OptPForBlockCodec,
-    QmxBlockCodec,
-    SimdBpBlockCodec,
-    Simple16BlockCodec,
-    Simple8bBlockCodec,
-    StreamVByteBlockCodec,
-    VarintG8IUBlockCodec,
-    VarintGbBlockCodec>;
-
-[[nodiscard]] auto get_block_codec(std::string_view name) -> BlockCodecPtr {
-    return BlockCodecs::get(name);
-}
-
-[[nodiscard]] constexpr auto get_block_codec_names() -> gsl::span<std::string_view const> {
-    return gsl::make_span<std::string_view const>(&BlockCodecs::names[0], BlockCodecs::count());
-}
+/**
+ * Resolves a block codec from a name and returns a shared pointer to the created object.
+ *
+ * If the name is not recognized, `nullptr` is returned.
+ */
+[[nodiscard]] auto get_block_codec(std::string_view name) -> BlockCodecPtr;
+
+/**
+ * Lists the names of all known block codecs.
+ */
+[[nodiscard]] constexpr auto get_block_codec_names() -> gsl::span<std::string_view const>;
 
 }  // namespace pisa
diff --git a/include/pisa/codec/interpolative.hpp b/include/pisa/codec/interpolative.hpp
@@ -7,6 +7,12 @@
 
 namespace pisa {
 
+/**
+ * Interpolative coding.
+ *
+ * Alistair Moffat, Lang Stuiver: Binary Interpolative Coding for Effective Index Compression. Inf.
+ * Retr. 3(1): 25-47 (2000)
+ */
 class InterpolativeBlockCodec: public BlockCodec {
     static constexpr std::uint64_t m_block_size = 128;
 

diff --git a/include/pisa/codec/maskedvbyte.hpp b/include/pisa/codec/maskedvbyte.hpp
@@ -6,6 +6,12 @@
 
 namespace pisa {
 
+/**
+ * Masked VByte coding.
+ *
+ * Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on
+ * Web Algorithms 2015, 2015.
+ */
 class MaskedVByteBlockCodec: public BlockCodec {
     static constexpr std::uint64_t m_block_size = 128;
     static constexpr std::uint64_t m_overflow = 512;

diff --git a/include/pisa/codec/optpfor.hpp b/include/pisa/codec/optpfor.hpp
@@ -8,39 +8,17 @@
 
 namespace pisa {
 
+/**
+ * OptPForDelta coding.
+ *
+ * Hao Yan, Shuai Ding, and Torsten Suel. 2009. Inverted index compression and query processing with
+ * optimized document ordering. In Proceedings of the 18th international conference on World wide
+ * web (WWW '09). ACM, New York, NY, USA, 401-410. DOI: https://doi.org/10.1145/1526709.1526764
+ */
 class OptPForBlockCodec: public BlockCodec {
     struct Codec: FastPForLib::OPTPFor<4, FastPForLib::Simple16<false>> {
-        uint8_t const* force_b{nullptr};
-
-        uint32_t findBestB(const uint32_t* in, uint32_t len) {
-            // trick to force the choice of b from a parameter
-            if (force_b != nullptr) {
-                return *force_b;
-            }
-
-            // this is mostly a cut&paste from FastPFor, but we stop the
-            // optimization early as the b to test becomes larger than maxb
-            uint32_t b = 0;
-            uint32_t bsize = std::numeric_limits<uint32_t>::max();
-            const uint32_t mb = FastPForLib::maxbits(in, in + len);
-            uint32_t i = 0;
-            while (mb > 28 + possLogs[i]) {
-                ++i;  // some schemes such as Simple16 don't code numbers greater than 28
-            }
-
-            for (; i < possLogs.size(); i++) {
-                if (possLogs[i] > mb && possLogs[i] >= mb) {
-                    break;
-                }
-                const uint32_t csize = tryB(possLogs[i], in, len);
-
-                if (csize <= bsize) {
-                    b = possLogs[i];
-                    bsize = csize;
-                }
-            }
-            return b;
-        }
+        uint8_t const* force_b = nullptr;
+        uint32_t findBestB(const uint32_t* in, uint32_t len);
     };
 
     static const uint64_t m_block_size = Codec::BlockSize;

diff --git a/include/pisa/codec/qmx.hpp b/include/pisa/codec/qmx.hpp
@@ -4,6 +4,14 @@
 
 namespace pisa {
 
+/**
+ * Quantities, Multipliers, and eXtractor (QMX) coding.
+ *
+ * Andrew Trotman. 2014. Compression, SIMD, and Postings Lists. In Proceedings of the 2014
+ * Australasian Document Computing Symposium (ADCS '14), J. Shane Culpepper, Laurence Park, and
+ * Guido Zuccon (Eds.). ACM, New York, NY, USA, Pages 50, 8 pages. DOI:
+ * https://doi.org/10.1145/2682862.2682870
+ */
 class QmxBlockCodec: public BlockCodec {
     static constexpr std::uint64_t m_block_size = 128;
     static constexpr std::uint64_t m_overflow = 512;

diff --git a/include/pisa/codec/simdbp.hpp b/include/pisa/codec/simdbp.hpp
@@ -6,6 +6,12 @@
 
 namespace pisa {
 
+/**
+ * SIMD-BP128 coding.
+ *
+ * Daniel Lemire, Leonid Boytsov: Decoding billions of integers per second through vectorization.
+ * Softw., Pract. Exper. 45(1): 1-29 (2015)
+ */
 class SimdBpBlockCodec: public BlockCodec {
     static constexpr std::uint64_t m_block_size = 128;
 

diff --git a/include/pisa/codec/simple16.hpp b/include/pisa/codec/simple16.hpp
@@ -4,6 +4,13 @@
 
 namespace pisa {
 
+/**
+ * Simple16 coding.
+ *
+ * Jiangong Zhang, Xiaohui Long, and Torsten Suel. 2008. Performance of compressed inverted list
+ * caching in search engines. In Proceedings of the 17th international conference on World Wide Web
+ * (WWW '08). ACM, New York, NY, USA, 387-396. DOI: https://doi.org/10.1145/1367497.1367550
+ */
 class Simple16BlockCodec: public BlockCodec {
     static constexpr std::uint64_t m_block_size = 128;
 

diff --git a/include/pisa/codec/simple8b.hpp b/include/pisa/codec/simple8b.hpp
@@ -4,6 +4,12 @@
 
 namespace pisa {
 
+/**
+ * Simple8b coding.
+ *
+ * Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words. Softw., Pract. Exper. 40(2):
+ * 131-147 (2010)
+ */
 class Simple8bBlockCodec: public BlockCodec {
     static constexpr std::uint64_t m_block_size = 128;
 

diff --git a/include/pisa/codec/streamvbyte.hpp b/include/pisa/codec/streamvbyte.hpp
@@ -18,6 +18,12 @@ constexpr std::size_t streamvbyte_max_compressedbytes(std::uint32_t length) {
     return cb + db;
 }
 
+/**
+ * StreamVByte coding.
+ *
+ * Daniel Lemire, Nathan Kurz, Christoph Rupp: Stream VByte: Faster byte-oriented integer
+ * compression. Inf. Process. Lett. 130: 1-6 (2018). DOI: https://doi.org/10.1016/j.ipl.2017.09.011
+ */
 class StreamVByteBlockCodec: public BlockCodec {
     static constexpr std::uint64_t m_block_size = 128;
     static constexpr std::size_t m_max_compressed_bytes =

diff --git a/include/pisa/codec/varint_g8iu.hpp b/include/pisa/codec/varint_g8iu.hpp
@@ -7,6 +7,15 @@
 
 namespace pisa {
 
+/**
+ * Varint-G8IU coding.
+ *
+ * Alexander A. Stepanov, Anil R. Gangolli, Daniel E. Rose, Ryan J. Ernst, and Paramjit S. Oberoi.
+ * 2011. SIMD-based decoding of posting lists. In Proceedings of the 20th ACM international
+ * conference on Information and knowledge management (CIKM '11), Bettina Berendt, Arjen de Vries,
+ * Wenfei Fan, Craig Macdonald, Iadh Ounis, and Ian Ruthven (Eds.). ACM, New York, NY, USA, 317-326.
+ * DOI: https://doi.org/10.1145/2063576.2063627
+ */
 class VarintG8IUBlockCodec: public BlockCodec {
     static const uint64_t m_block_size = 128;
 

diff --git a/include/pisa/codec/varintgb.hpp b/include/pisa/codec/varintgb.hpp
@@ -228,6 +228,14 @@ class VarIntGB {
     }
 };
 
+/**
+ * VarintGB coding.
+ *
+ * Jeffrey Dean. 2009. Challenges in building large-scale information retrieval systems: invited
+ * talk. In Proceedings of the Second ACM International Conference on Web Search and Data Mining
+ * (WSDM '09), Ricardo Baeza-Yates, Paolo Boldi, Berthier Ribeiro-Neto, and B. Barla Cambazoglu
+ * (Eds.). ACM, New York, NY, USA, 1-1. DOI: https://doi.org/10.1145/1498759.1498761
+ */
 class VarintGbBlockCodec: public BlockCodec {
     static constexpr std::uint64_t m_block_size = 128;
 

diff --git a/include/pisa/cursor/scored_cursor.hpp b/include/pisa/cursor/scored_cursor.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <vector>
-
 #include "concepts.hpp"
 #include "concepts/posting_cursor.hpp"
 #include "query.hpp"

diff --git a/include/pisa/scorer/bm25.hpp b/include/pisa/scorer/bm25.hpp
@@ -5,6 +5,7 @@
 #include <cstdint>
 
 #include "index_scorer.hpp"
+
 namespace pisa {
 
 /// Implements the Okapi BM25 model. k1 and b are both free parameters which
@@ -14,11 +15,11 @@ namespace pisa {
 /// in Proceedings of the SIGIR 2012 Workshop on Open Source Information
 /// Retrieval (OSIR), 2012.
 template <typename Wand>
-struct bm25: public index_scorer<Wand> {
-    using index_scorer<Wand>::index_scorer;
+struct bm25: public WandIndexScorer<Wand> {
+    using WandIndexScorer<Wand>::WandIndexScorer;
 
     bm25(const Wand& wdata, const float b, const float k1)
-        : index_scorer<Wand>(wdata), m_b(b), m_k1(k1) {}
+        : WandIndexScorer<Wand>(wdata), m_b(b), m_k1(k1) {}
 
     float doc_term_weight(uint64_t freq, float norm_len) const {
         auto f = static_cast<float>(freq);
@@ -33,7 +34,7 @@ struct bm25: public index_scorer<Wand> {
         return std::max(epsilon_score, idf) * (1.0F + m_k1);
     }
 
-    term_scorer_t term_scorer(uint64_t term_id) const override {
+    TermScorer term_scorer(uint64_t term_id) const override {
         auto term_len = this->m_wdata.term_posting_count(term_id);
         auto term_weight = query_term_weight(term_len, this->m_wdata.num_docs());
         auto s = [&, term_weight](uint32_t doc, uint32_t freq) {

diff --git a/include/pisa/scorer/dph.hpp b/include/pisa/scorer/dph.hpp
@@ -2,9 +2,6 @@
 
 #define _USE_MATH_DEFINES
 
-#include <cmath>
-
-#include <algorithm>
 #include <cmath>
 #include <cstdint>
 
@@ -18,10 +15,10 @@ namespace pisa {
 /// Vergata at TREC 2007 Blog Track," in Proceedings of the 16th Text REtrieval
 /// Conference (TREC), 2007.
 template <typename Wand>
-struct dph: public index_scorer<Wand> {
-    using index_scorer<Wand>::index_scorer;
+struct dph: public WandIndexScorer<Wand> {
+    using WandIndexScorer<Wand>::WandIndexScorer;
 
-    term_scorer_t term_scorer(uint64_t term_id) const override {
+    TermScorer term_scorer(uint64_t term_id) const override {
         auto s = [&, term_id](uint32_t doc, uint32_t freq) {
             float f = (float)freq / this->m_wdata.doc_len(doc);
             float norm = (1.F - f) * (1.F - f) / (freq + 1.F);

diff --git a/include/pisa/scorer/index_scorer.hpp b/include/pisa/scorer/index_scorer.hpp
@@ -5,26 +5,27 @@
 
 namespace pisa {
 
-using term_scorer_t = std::function<float(uint32_t, uint32_t)>;
 using TermScorer = std::function<float(uint32_t, uint32_t)>;
 
+/** An index scorer constructs scorers for terms in the index. */
 class IndexScorer {
   public:
     virtual TermScorer term_scorer(std::uint64_t term_id) const = 0;
 };
 
+/** Index scorer using WAND metadata for scoring. */
 template <typename Wand>
-struct index_scorer: IndexScorer {
+struct WandIndexScorer: IndexScorer {
   protected:
     const Wand& m_wdata;
 
   public:
-    explicit index_scorer(const Wand& wdata) : m_wdata(wdata) {}
-    index_scorer(index_scorer const&) = default;
-    index_scorer(index_scorer&&) noexcept = default;
-    index_scorer& operator=(index_scorer const&) = delete;
-    index_scorer& operator=(index_scorer&&) noexcept = delete;
-    virtual ~index_scorer() = default;
+    explicit WandIndexScorer(const Wand& wdata) : m_wdata(wdata) {}
+    WandIndexScorer(WandIndexScorer const&) = default;
+    WandIndexScorer(WandIndexScorer&&) noexcept = default;
+    WandIndexScorer& operator=(WandIndexScorer const&) = delete;
+    WandIndexScorer& operator=(WandIndexScorer&&) noexcept = delete;
+    virtual ~WandIndexScorer() = default;
 };
 
 }  // namespace pisa
diff --git a/include/pisa/scorer/pl2.hpp b/include/pisa/scorer/pl2.hpp
@@ -4,7 +4,6 @@
 
 #include <cmath>
 
-#include <algorithm>
 #include <cmath>
 #include <cstdint>
 
@@ -17,12 +16,12 @@ namespace pisa {
 /// G. Amati: "Probabilistic models for information retrieval based on
 /// divergence from randomness." PhD Thesis, University of Glasgow, 2003.
 template <typename Wand>
-struct pl2: public index_scorer<Wand> {
-    using index_scorer<Wand>::index_scorer;
+struct pl2: public WandIndexScorer<Wand> {
+    using WandIndexScorer<Wand>::WandIndexScorer;
 
-    pl2(const Wand& wdata, const float c) : index_scorer<Wand>(wdata), m_c(c) {}
+    pl2(const Wand& wdata, const float c) : WandIndexScorer<Wand>(wdata), m_c(c) {}
 
-    term_scorer_t term_scorer(uint64_t term_id) const override {
+    TermScorer term_scorer(uint64_t term_id) const override {
         auto s = [&, term_id](uint32_t doc, uint32_t freq) {
             float tfn =
                 freq * std::log2(1.F + (m_c * this->m_wdata.avg_len()) / this->m_wdata.doc_len(doc));

diff --git a/include/pisa/scorer/qld.hpp b/include/pisa/scorer/qld.hpp
@@ -17,12 +17,12 @@ namespace pisa {
 /// Language Models Applied to Ad Hoc Information Retrieval," in Proceedings of
 /// SIGIR, 2001.
 template <typename Wand>
-struct qld: public index_scorer<Wand> {
-    using index_scorer<Wand>::index_scorer;
+struct qld: public WandIndexScorer<Wand> {
+    using WandIndexScorer<Wand>::WandIndexScorer;
 
-    qld(const Wand& wdata, const float mu) : index_scorer<Wand>(wdata), m_mu(mu) {}
+    qld(const Wand& wdata, const float mu) : WandIndexScorer<Wand>(wdata), m_mu(mu) {}
 
-    term_scorer_t term_scorer(uint64_t term_id) const override {
+    TermScorer term_scorer(uint64_t term_id) const override {
         float mu = this->m_mu;
         float collection_len = this->m_wdata.collection_len();
         float term_occurrences = this->m_wdata.term_occurrence_count(term_id);