Skip to content

Commit

Permalink
Fixes and docs updates
Browse files Browse the repository at this point in the history
  • Loading branch information
elshize committed May 18, 2024
1 parent 686541f commit 6178ca5
Show file tree
Hide file tree
Showing 29 changed files with 222 additions and 186 deletions.
7 changes: 5 additions & 2 deletions include/pisa/codec/block_codec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
namespace pisa {

/**
* Block codecs encode and decode an entire list. This is in opposition to a streaming codec,
* Block codecs encode and decode a list of integers. This is in contrast to a streaming codec,
* which can encode and decode values one by one.
*/
class BlockCodec {
Expand All @@ -32,10 +32,13 @@ class BlockCodec {
* Returns the block size of the encoding.
*
* Block codecs write blocks of fixed size, e.g., 128 integers. Thus, it is only possible to
* encode at most `block_size()` elements.
* encode at most `block_size()` elements with a single `encode` call.
*/
[[nodiscard]] virtual auto block_size() const noexcept -> std::size_t = 0;

/**
* Returns the name of the codec.
*/
[[nodiscard]] virtual auto get_name() const noexcept -> std::string_view = 0;
};

Expand Down
40 changes: 11 additions & 29 deletions include/pisa/codec/block_codec_registry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,6 @@
#include <gsl/span>

#include "codec/block_codec.hpp"
#include "codec/interpolative.hpp"
#include "codec/maskedvbyte.hpp"
#include "codec/optpfor.hpp"
#include "codec/qmx.hpp"
#include "codec/simdbp.hpp"
#include "codec/simple16.hpp"
#include "codec/simple8b.hpp"
#include "codec/streamvbyte.hpp"
#include "codec/varint_g8iu.hpp"
#include "codec/varintgb.hpp"

namespace pisa {

Expand Down Expand Up @@ -44,24 +34,16 @@ struct BlockCodecRegistry {
}
};

using BlockCodecs = BlockCodecRegistry<
InterpolativeBlockCodec,
MaskedVByteBlockCodec,
OptPForBlockCodec,
QmxBlockCodec,
SimdBpBlockCodec,
Simple16BlockCodec,
Simple8bBlockCodec,
StreamVByteBlockCodec,
VarintG8IUBlockCodec,
VarintGbBlockCodec>;

[[nodiscard]] auto get_block_codec(std::string_view name) -> BlockCodecPtr {
return BlockCodecs::get(name);
}

[[nodiscard]] constexpr auto get_block_codec_names() -> gsl::span<std::string_view const> {
return gsl::make_span<std::string_view const>(&BlockCodecs::names[0], BlockCodecs::count());
}
/**
* Resolves a block codec from a name and returns a shared pointer to the created object.
*
* If the name is not recognized, `nullptr` is returned.
*/
[[nodiscard]] auto get_block_codec(std::string_view name) -> BlockCodecPtr;

/**
* Lists the names of all known block codecs.
*
* Returns a `gsl::span` of `std::string_view` elements, i.e., a non-owning view of the names.
*
* NOTE(review): this function is declared `constexpr` (and is therefore implicitly inline), but no
* definition is visible in the part of this header shown here. An inline function must be defined
* in every translation unit that odr-uses it, so callers that only include this header may fail
* to compile or link -- confirm where the definition lives, or drop `constexpr` from both the
* declaration and the definition.
*/
[[nodiscard]] constexpr auto get_block_codec_names() -> gsl::span<std::string_view const>;

} // namespace pisa
6 changes: 6 additions & 0 deletions include/pisa/codec/interpolative.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@

namespace pisa {

/**
* Interpolative coding.
*
* Alistair Moffat, Lang Stuiver: Binary Interpolative Coding for Effective Index Compression. Inf.
* Retr. 3(1): 25-47 (2000)
*/
class InterpolativeBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

Expand Down
6 changes: 6 additions & 0 deletions include/pisa/codec/maskedvbyte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@

namespace pisa {

/**
* Masked VByte coding.
*
* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on
* Web Algorithms 2015, 2015.
*/
class MaskedVByteBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;
static constexpr std::uint64_t m_overflow = 512;
Expand Down
40 changes: 9 additions & 31 deletions include/pisa/codec/optpfor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,39 +8,17 @@

namespace pisa {

/**
* OptPForDelta coding.
*
* Hao Yan, Shuai Ding, and Torsten Suel. 2009. Inverted index compression and query processing with
* optimized document ordering. In Proceedings of the 18th international conference on World wide
* web (WWW '09). ACM, New York, NY, USA, 401-410. DOI: https://doi.org/10.1145/1526709.1526764
*/
class OptPForBlockCodec: public BlockCodec {
struct Codec: FastPForLib::OPTPFor<4, FastPForLib::Simple16<false>> {
uint8_t const* force_b{nullptr};

uint32_t findBestB(const uint32_t* in, uint32_t len) {
// trick to force the choice of b from a parameter
if (force_b != nullptr) {
return *force_b;
}

// this is mostly a cut&paste from FastPFor, but we stop the
// optimization early as the b to test becomes larger than maxb
uint32_t b = 0;
uint32_t bsize = std::numeric_limits<uint32_t>::max();
const uint32_t mb = FastPForLib::maxbits(in, in + len);
uint32_t i = 0;
while (mb > 28 + possLogs[i]) {
++i; // some schemes such as Simple16 don't code numbers greater than 28
}

for (; i < possLogs.size(); i++) {
if (possLogs[i] > mb && possLogs[i] >= mb) {
break;
}
const uint32_t csize = tryB(possLogs[i], in, len);

if (csize <= bsize) {
b = possLogs[i];
bsize = csize;
}
}
return b;
}
uint8_t const* force_b = nullptr;
uint32_t findBestB(const uint32_t* in, uint32_t len);
};

static const uint64_t m_block_size = Codec::BlockSize;
Expand Down
8 changes: 8 additions & 0 deletions include/pisa/codec/qmx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@

namespace pisa {

/**
* Quantities, Multipliers, and eXtractor (QMX) coding.
*
* Andrew Trotman. 2014. Compression, SIMD, and Postings Lists. In Proceedings of the 2014
* Australasian Document Computing Symposium (ADCS '14), J. Shane Culpepper, Laurence Park, and
* Guido Zuccon (Eds.). ACM, New York, NY, USA, Pages 50, 8 pages. DOI:
* https://doi.org/10.1145/2682862.2682870
*/
class QmxBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;
static constexpr std::uint64_t m_overflow = 512;
Expand Down
6 changes: 6 additions & 0 deletions include/pisa/codec/simdbp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@

namespace pisa {

/**
* SIMD-BP128 coding.
*
* Daniel Lemire, Leonid Boytsov: Decoding billions of integers per second through vectorization.
* Softw., Pract. Exper. 45(1): 1-29 (2015)
*/
class SimdBpBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

Expand Down
7 changes: 7 additions & 0 deletions include/pisa/codec/simple16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@

namespace pisa {

/**
* Simple16 coding.
*
* Jiangong Zhang, Xiaohui Long, and Torsten Suel. 2008. Performance of compressed inverted list
* caching in search engines. In Proceedings of the 17th international conference on World Wide Web
* (WWW '08). ACM, New York, NY, USA, 387-396. DOI: https://doi.org/10.1145/1367497.1367550
*/
class Simple16BlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

Expand Down
6 changes: 6 additions & 0 deletions include/pisa/codec/simple8b.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

namespace pisa {

/**
* Simple8b coding.
*
* Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words. Softw., Pract. Exper. 40(2):
* 131-147 (2010)
*/
class Simple8bBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

Expand Down
6 changes: 6 additions & 0 deletions include/pisa/codec/streamvbyte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ constexpr std::size_t streamvbyte_max_compressedbytes(std::uint32_t length) {
return cb + db;
}

/**
* StreamVByte coding.
*
* Daniel Lemire, Nathan Kurz, Christoph Rupp: Stream VByte: Faster byte-oriented integer
* compression. Inf. Process. Lett. 130: 1-6 (2018). DOI: https://doi.org/10.1016/j.ipl.2017.09.011
*/
class StreamVByteBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;
static constexpr std::size_t m_max_compressed_bytes =
Expand Down
9 changes: 9 additions & 0 deletions include/pisa/codec/varint_g8iu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@

namespace pisa {

/**
* Varint-G8IU coding.
*
* Alexander A. Stepanov, Anil R. Gangolli, Daniel E. Rose, Ryan J. Ernst, and Paramjit S. Oberoi.
* 2011. SIMD-based decoding of posting lists. In Proceedings of the 20th ACM international
* conference on Information and knowledge management (CIKM '11), Bettina Berendt, Arjen de Vries,
* Wenfei Fan, Craig Macdonald, Iadh Ounis, and Ian Ruthven (Eds.). ACM, New York, NY, USA, 317-326.
* DOI: https://doi.org/10.1145/2063576.2063627
*/
class VarintG8IUBlockCodec: public BlockCodec {
static const uint64_t m_block_size = 128;

Expand Down
8 changes: 8 additions & 0 deletions include/pisa/codec/varintgb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,14 @@ class VarIntGB {
}
};

/**
* VarintGB coding.
*
* Jeffrey Dean. 2009. Challenges in building large-scale information retrieval systems: invited
* talk. In Proceedings of the Second ACM International Conference on Web Search and Data Mining
* (WSDM '09), Ricardo Baeza-Yates, Paolo Boldi, Berthier Ribeiro-Neto, and B. Barla Cambazoglu
* (Eds.). ACM, New York, NY, USA, 1-1. DOI: http://dx.doi.org/10.1145/1498759.1498761
*/
class VarintGbBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

Expand Down
2 changes: 0 additions & 2 deletions include/pisa/cursor/scored_cursor.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#pragma once

#include <vector>

#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"
#include "query.hpp"
Expand Down
9 changes: 5 additions & 4 deletions include/pisa/scorer/bm25.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <cstdint>

#include "index_scorer.hpp"

namespace pisa {

/// Implements the Okapi BM25 model. k1 and b are both free parameters which
Expand All @@ -14,11 +15,11 @@ namespace pisa {
/// in Proceedings of the SIGIR 2012 Workshop on Open Source Information
/// Retrieval (OSIR), 2012.
template <typename Wand>
struct bm25: public index_scorer<Wand> {
using index_scorer<Wand>::index_scorer;
struct bm25: public WandIndexScorer<Wand> {
using WandIndexScorer<Wand>::WandIndexScorer;

bm25(const Wand& wdata, const float b, const float k1)
: index_scorer<Wand>(wdata), m_b(b), m_k1(k1) {}
: WandIndexScorer<Wand>(wdata), m_b(b), m_k1(k1) {}

float doc_term_weight(uint64_t freq, float norm_len) const {
auto f = static_cast<float>(freq);
Expand All @@ -33,7 +34,7 @@ struct bm25: public index_scorer<Wand> {
return std::max(epsilon_score, idf) * (1.0F + m_k1);
}

term_scorer_t term_scorer(uint64_t term_id) const override {
TermScorer term_scorer(uint64_t term_id) const override {
auto term_len = this->m_wdata.term_posting_count(term_id);
auto term_weight = query_term_weight(term_len, this->m_wdata.num_docs());
auto s = [&, term_weight](uint32_t doc, uint32_t freq) {
Expand Down
9 changes: 3 additions & 6 deletions include/pisa/scorer/dph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@

#define _USE_MATH_DEFINES

#include <cmath>

#include <algorithm>
#include <cmath>
#include <cstdint>

Expand All @@ -18,10 +15,10 @@ namespace pisa {
/// Vergata at TREC 2007 Blog Track," in Proceedings of the 16th Text REtrieval
/// Conference (TREC), 2007.
template <typename Wand>
struct dph: public index_scorer<Wand> {
using index_scorer<Wand>::index_scorer;
struct dph: public WandIndexScorer<Wand> {
using WandIndexScorer<Wand>::WandIndexScorer;

term_scorer_t term_scorer(uint64_t term_id) const override {
TermScorer term_scorer(uint64_t term_id) const override {
auto s = [&, term_id](uint32_t doc, uint32_t freq) {
float f = (float)freq / this->m_wdata.doc_len(doc);
float norm = (1.F - f) * (1.F - f) / (freq + 1.F);
Expand Down
17 changes: 9 additions & 8 deletions include/pisa/scorer/index_scorer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@

namespace pisa {

using term_scorer_t = std::function<float(uint32_t, uint32_t)>;
/**
 * Callable that scores a single posting.
 *
 * The scorers in this code base invoke it with a document ID and a term frequency, in that order.
 */
using TermScorer = std::function<float(uint32_t, uint32_t)>;

/**
 * An index scorer constructs scorers for terms in the index.
 *
 * This is a polymorphic interface, so its destructor is virtual: destroying a concrete scorer
 * through a pointer or reference to `IndexScorer` is then well defined. The remaining special
 * members are explicitly defaulted so that derived classes can still default their own copy and
 * move constructors.
 */
class IndexScorer {
  public:
    IndexScorer() = default;
    IndexScorer(IndexScorer const&) = default;
    IndexScorer(IndexScorer&&) noexcept = default;
    IndexScorer& operator=(IndexScorer const&) = default;
    IndexScorer& operator=(IndexScorer&&) noexcept = default;
    virtual ~IndexScorer() = default;

    /**
     * Returns the scorer for the term identified by `term_id`.
     */
    virtual TermScorer term_scorer(std::uint64_t term_id) const = 0;
};

/** Index scorer using WAND metadata for scoring. */
template <typename Wand>
struct index_scorer: IndexScorer {
struct WandIndexScorer: IndexScorer {
protected:
const Wand& m_wdata;

public:
explicit index_scorer(const Wand& wdata) : m_wdata(wdata) {}
index_scorer(index_scorer const&) = default;
index_scorer(index_scorer&&) noexcept = default;
index_scorer& operator=(index_scorer const&) = delete;
index_scorer& operator=(index_scorer&&) noexcept = delete;
virtual ~index_scorer() = default;
explicit WandIndexScorer(const Wand& wdata) : m_wdata(wdata) {}
WandIndexScorer(WandIndexScorer const&) = default;
WandIndexScorer(WandIndexScorer&&) noexcept = default;
WandIndexScorer& operator=(WandIndexScorer const&) = delete;
WandIndexScorer& operator=(WandIndexScorer&&) noexcept = delete;
virtual ~WandIndexScorer() = default;
};

} // namespace pisa
9 changes: 4 additions & 5 deletions include/pisa/scorer/pl2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

#include <cmath>

#include <algorithm>
#include <cmath>
#include <cstdint>

Expand All @@ -17,12 +16,12 @@ namespace pisa {
/// G. Amati: "Probabilistic models for information retrieval based on
/// divergence from randomness." PhD Thesis, University of Glasgow, 2003.
template <typename Wand>
struct pl2: public index_scorer<Wand> {
using index_scorer<Wand>::index_scorer;
struct pl2: public WandIndexScorer<Wand> {
using WandIndexScorer<Wand>::WandIndexScorer;

pl2(const Wand& wdata, const float c) : index_scorer<Wand>(wdata), m_c(c) {}
pl2(const Wand& wdata, const float c) : WandIndexScorer<Wand>(wdata), m_c(c) {}

term_scorer_t term_scorer(uint64_t term_id) const override {
TermScorer term_scorer(uint64_t term_id) const override {
auto s = [&, term_id](uint32_t doc, uint32_t freq) {
float tfn =
freq * std::log2(1.F + (m_c * this->m_wdata.avg_len()) / this->m_wdata.doc_len(doc));
Expand Down
8 changes: 4 additions & 4 deletions include/pisa/scorer/qld.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ namespace pisa {
/// Language Models Applied to Ad Hoc Information Retrieval," in Proceedings of
/// SIGIR, 2001.
template <typename Wand>
struct qld: public index_scorer<Wand> {
using index_scorer<Wand>::index_scorer;
struct qld: public WandIndexScorer<Wand> {
using WandIndexScorer<Wand>::WandIndexScorer;

qld(const Wand& wdata, const float mu) : index_scorer<Wand>(wdata), m_mu(mu) {}
qld(const Wand& wdata, const float mu) : WandIndexScorer<Wand>(wdata), m_mu(mu) {}

term_scorer_t term_scorer(uint64_t term_id) const override {
TermScorer term_scorer(uint64_t term_id) const override {
float mu = this->m_mu;
float collection_len = this->m_wdata.collection_len();
float term_occurrences = this->m_wdata.term_occurrence_count(term_id);
Expand Down
Loading

0 comments on commit 6178ca5

Please sign in to comment.