diff --git a/include/pisa/codec/block_codec.hpp b/include/pisa/codec/block_codec.hpp index 206f7d0e..9a7a8bc7 100644 --- a/include/pisa/codec/block_codec.hpp +++ b/include/pisa/codec/block_codec.hpp @@ -8,7 +8,7 @@ namespace pisa { /** - * Block codecs encode and decode an entire list. This is in opposition to a streaming codec, + * Block codecs encode and decode a list of integers. This is in opposition to a streaming codec, * which can encode and decode values one by one. */ class BlockCodec { @@ -32,10 +32,13 @@ class BlockCodec { * Returns the block size of the encoding. * * Block codecs write blocks of fixed size, e.g., 128 integers. Thus, it is only possible to - * encode at most `block_size()` elements. + * encode at most `block_size()` elements with a single `encode` call. */ [[nodiscard]] virtual auto block_size() const noexcept -> std::size_t = 0; + /** + * Returns the name of the codec. + */ [[nodiscard]] virtual auto get_name() const noexcept -> std::string_view = 0; }; diff --git a/include/pisa/codec/block_codec_registry.hpp b/include/pisa/codec/block_codec_registry.hpp index cbfcff87..e6947db3 100644 --- a/include/pisa/codec/block_codec_registry.hpp +++ b/include/pisa/codec/block_codec_registry.hpp @@ -7,16 +7,6 @@ #include <gsl/span> #include "codec/block_codec.hpp" -#include "codec/interpolative.hpp" -#include "codec/maskedvbyte.hpp" -#include "codec/optpfor.hpp" -#include "codec/qmx.hpp" -#include "codec/simdbp.hpp" -#include "codec/simple16.hpp" -#include "codec/simple8b.hpp" -#include "codec/streamvbyte.hpp" -#include "codec/varint_g8iu.hpp" -#include "codec/varintgb.hpp" namespace pisa { @@ -44,24 +34,16 @@ struct BlockCodecRegistry { } }; -using BlockCodecs = BlockCodecRegistry< - InterpolativeBlockCodec, - MaskedVByteBlockCodec, - OptPForBlockCodec, - QmxBlockCodec, - SimdBpBlockCodec, - Simple16BlockCodec, - Simple8bBlockCodec, - StreamVByteBlockCodec, - VarintG8IUBlockCodec, - VarintGbBlockCodec>; - -[[nodiscard]] auto 
get_block_codec(std::string_view name) -> BlockCodecPtr { - return BlockCodecs::get(name); -} - -[[nodiscard]] constexpr auto get_block_codec_names() -> gsl::span<std::string_view const> { - return gsl::make_span<std::string_view const>(&BlockCodecs::names[0], BlockCodecs::count()); -} +/** + * Resolves a block codec from a name and returns a shared pointer to the created object. + * + * If the name is not recognized, `nullptr` is returned. + */ +[[nodiscard]] auto get_block_codec(std::string_view name) -> BlockCodecPtr; + +/** + * Lists the names of all known block codecs. + */ +[[nodiscard]] constexpr auto get_block_codec_names() -> gsl::span<std::string_view const>; } // namespace pisa diff --git a/include/pisa/codec/interpolative.hpp b/include/pisa/codec/interpolative.hpp index f01259ac..a362777e 100644 --- a/include/pisa/codec/interpolative.hpp +++ b/include/pisa/codec/interpolative.hpp @@ -7,6 +7,12 @@ namespace pisa { +/** + * Interpolative coding. + * + * Alistair Moffat, Lang Stuiver: Binary Interpolative Coding for Effective Index Compression. Inf. + * Retr. 3(1): 25-47 (2000) + */ class InterpolativeBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; diff --git a/include/pisa/codec/maskedvbyte.hpp b/include/pisa/codec/maskedvbyte.hpp index e2360457..0b324998 100644 --- a/include/pisa/codec/maskedvbyte.hpp +++ b/include/pisa/codec/maskedvbyte.hpp @@ -6,6 +6,12 @@ namespace pisa { +/** + * Masked VByte coding. + * + * Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on + * Web Algorithms 2015, 2015. 
+ */ class MaskedVByteBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; static constexpr std::uint64_t m_overflow = 512; diff --git a/include/pisa/codec/optpfor.hpp b/include/pisa/codec/optpfor.hpp index 1ad52223..57790f38 100644 --- a/include/pisa/codec/optpfor.hpp +++ b/include/pisa/codec/optpfor.hpp @@ -8,39 +8,17 @@ namespace pisa { +/** + * OptPForDelta coding. + * + * Hao Yan, Shuai Ding, and Torsten Suel. 2009. Inverted index compression and query processing with + * optimized document ordering. In Proceedings of the 18th international conference on World wide + * web (WWW '09). ACM, New York, NY, USA, 401-410. DOI: https://doi.org/10.1145/1526709.1526764 + */ class OptPForBlockCodec: public BlockCodec { struct Codec: FastPForLib::OPTPFor<4, FastPForLib::Simple16<false>> { - uint8_t const* force_b{nullptr}; - - uint32_t findBestB(const uint32_t* in, uint32_t len) { - // trick to force the choice of b from a parameter - if (force_b != nullptr) { - return *force_b; - } - - // this is mostly a cut&paste from FastPFor, but we stop the - // optimization early as the b to test becomes larger than maxb - uint32_t b = 0; - uint32_t bsize = std::numeric_limits<uint32_t>::max(); - const uint32_t mb = FastPForLib::maxbits(in, in + len); - uint32_t i = 0; - while (mb > 28 + possLogs[i]) { - ++i; // some schemes such as Simple16 don't code numbers greater than 28 - } - - for (; i < possLogs.size(); i++) { - if (possLogs[i] > mb && possLogs[i] >= mb) { - break; - } - const uint32_t csize = tryB(possLogs[i], in, len); - - if (csize <= bsize) { - b = possLogs[i]; - bsize = csize; - } - } - return b; - } + uint8_t const* force_b = nullptr; + uint32_t findBestB(const uint32_t* in, uint32_t len); }; static const uint64_t m_block_size = Codec::BlockSize; diff --git a/include/pisa/codec/qmx.hpp b/include/pisa/codec/qmx.hpp index 17b64626..17e3507a 100644 --- a/include/pisa/codec/qmx.hpp +++ b/include/pisa/codec/qmx.hpp @@ -4,6 +4,14 @@ namespace 
pisa { +/** + * Quantities, Multipliers, and eXtractor (QMX) coding. + * + * Andrew Trotman. 2014. Compression, SIMD, and Postings Lists. In Proceedings of the 2014 + * Australasian Document Computing Symposium (ADCS '14), J. Shane Culpepper, Laurence Park, and + * Guido Zuccon (Eds.). ACM, New York, NY, USA, Pages 50, 8 pages. DOI: + * https://doi.org/10.1145/2682862.2682870 + */ class QmxBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; static constexpr std::uint64_t m_overflow = 512; diff --git a/include/pisa/codec/simdbp.hpp b/include/pisa/codec/simdbp.hpp index e45a3593..947a3636 100644 --- a/include/pisa/codec/simdbp.hpp +++ b/include/pisa/codec/simdbp.hpp @@ -6,6 +6,12 @@ namespace pisa { +/** + * SIMD-BP128 coding. + * + * Daniel Lemire, Leonid Boytsov: Decoding billions of integers per second through vectorization. + * Softw., Pract. Exper. 45(1): 1-29 (2015) + */ class SimdBpBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; diff --git a/include/pisa/codec/simple16.hpp b/include/pisa/codec/simple16.hpp index 3868408b..9c2fa7c8 100644 --- a/include/pisa/codec/simple16.hpp +++ b/include/pisa/codec/simple16.hpp @@ -4,6 +4,13 @@ namespace pisa { +/** + * Simple16 coding. + * + * Jiangong Zhang, Xiaohui Long, and Torsten Suel. 2008. Performance of compressed inverted list + * caching in search engines. In Proceedings of the 17th international conference on World Wide Web + * (WWW '08). ACM, New York, NY, USA, 387-396. DOI: https://doi.org/10.1145/1367497.1367550 + */ class Simple16BlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; diff --git a/include/pisa/codec/simple8b.hpp b/include/pisa/codec/simple8b.hpp index 5ec877da..c3fb995a 100644 --- a/include/pisa/codec/simple8b.hpp +++ b/include/pisa/codec/simple8b.hpp @@ -4,6 +4,12 @@ namespace pisa { +/** + * Simple8b coding. + * + * Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words. Softw., Pract. 
Exper. 40(2): + * 131-147 (2010) + */ class Simple8bBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; diff --git a/include/pisa/codec/streamvbyte.hpp b/include/pisa/codec/streamvbyte.hpp index 3ec96437..93031460 100644 --- a/include/pisa/codec/streamvbyte.hpp +++ b/include/pisa/codec/streamvbyte.hpp @@ -18,6 +18,12 @@ constexpr std::size_t streamvbyte_max_compressedbytes(std::uint32_t length) { return cb + db; } +/** + * StreamVByte coding. + * + * Daniel Lemire, Nathan Kurz, Christoph Rupp: Stream VByte: Faster byte-oriented integer + * compression. Inf. Process. Lett. 130: 1-6 (2018). DOI: https://doi.org/10.1016/j.ipl.2017.09.011 + */ class StreamVByteBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; static constexpr std::size_t m_max_compressed_bytes = diff --git a/include/pisa/codec/varint_g8iu.hpp b/include/pisa/codec/varint_g8iu.hpp index d84b8500..8cc651d3 100644 --- a/include/pisa/codec/varint_g8iu.hpp +++ b/include/pisa/codec/varint_g8iu.hpp @@ -7,6 +7,15 @@ namespace pisa { +/** + * Varint-G8IU coding. + * + * Alexander A. Stepanov, Anil R. Gangolli, Daniel E. Rose, Ryan J. Ernst, and Paramjit S. Oberoi. + * 2011. SIMD-based decoding of posting lists. In Proceedings of the 20th ACM international + * conference on Information and knowledge management (CIKM '11), Bettina Berendt, Arjen de Vries, + * Wenfei Fan, Craig Macdonald, Iadh Ounis, and Ian Ruthven (Eds.). ACM, New York, NY, USA, 317-326. + * DOI: https://doi.org/10.1145/2063576.2063627 + */ class VarintG8IUBlockCodec: public BlockCodec { static const uint64_t m_block_size = 128; diff --git a/include/pisa/codec/varintgb.hpp b/include/pisa/codec/varintgb.hpp index 1a2437fc..2c9e766f 100644 --- a/include/pisa/codec/varintgb.hpp +++ b/include/pisa/codec/varintgb.hpp @@ -228,6 +228,14 @@ class VarIntGB { } }; +/** + * VarintGB coding. + * + * Jeffrey Dean. 2009. 
Challenges in building large-scale information retrieval systems: invited + * talk. In Proceedings of the Second ACM International Conference on Web Search and Data Mining + * (WSDM '09), Ricardo Baeza-Yates, Paolo Boldi, Berthier Ribeiro-Neto, and B. Barla Cambazoglu + * (Eds.). ACM, New York, NY, USA, 1-1. DOI: https://doi.org/10.1145/1498759.1498761 + */ class VarintGbBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; diff --git a/include/pisa/cursor/scored_cursor.hpp b/include/pisa/cursor/scored_cursor.hpp index c7250109..990a520a 100644 --- a/include/pisa/cursor/scored_cursor.hpp +++ b/include/pisa/cursor/scored_cursor.hpp @@ -1,7 +1,5 @@ #pragma once -#include <vector> - #include "concepts.hpp" #include "concepts/posting_cursor.hpp" #include "query.hpp" diff --git a/include/pisa/scorer/bm25.hpp b/include/pisa/scorer/bm25.hpp index 7e586d3c..35bd8700 100644 --- a/include/pisa/scorer/bm25.hpp +++ b/include/pisa/scorer/bm25.hpp @@ -5,6 +5,7 @@ #include <cstdint> #include "index_scorer.hpp" + namespace pisa { /// Implements the Okapi BM25 model. k1 and b are both free parameters which @@ -14,11 +15,11 @@ namespace pisa { /// in Proceedings of the SIGIR 2012 Workshop on Open Source Information /// Retrieval (OSIR), 2012. 
template <typename Wand> -struct bm25: public index_scorer<Wand> { - using index_scorer<Wand>::index_scorer; +struct bm25: public WandIndexScorer<Wand> { + using WandIndexScorer<Wand>::WandIndexScorer; bm25(const Wand& wdata, const float b, const float k1) - : index_scorer<Wand>(wdata), m_b(b), m_k1(k1) {} + : WandIndexScorer<Wand>(wdata), m_b(b), m_k1(k1) {} float doc_term_weight(uint64_t freq, float norm_len) const { auto f = static_cast<float>(freq); @@ -33,7 +34,7 @@ struct bm25: public index_scorer<Wand> { return std::max(epsilon_score, idf) * (1.0F + m_k1); } - term_scorer_t term_scorer(uint64_t term_id) const override { + TermScorer term_scorer(uint64_t term_id) const override { auto term_len = this->m_wdata.term_posting_count(term_id); auto term_weight = query_term_weight(term_len, this->m_wdata.num_docs()); auto s = [&, term_weight](uint32_t doc, uint32_t freq) { diff --git a/include/pisa/scorer/dph.hpp b/include/pisa/scorer/dph.hpp index 1db508b1..22731f6a 100644 --- a/include/pisa/scorer/dph.hpp +++ b/include/pisa/scorer/dph.hpp @@ -2,9 +2,6 @@ #define _USE_MATH_DEFINES -#include <cmath> - -#include <algorithm> #include <cmath> #include <cstdint> @@ -18,10 +15,10 @@ namespace pisa { /// Vergata at TREC 2007 Blog Track," in Proceedings of the 16th Text REtrieval /// Conference (TREC), 2007. 
template <typename Wand> -struct dph: public index_scorer<Wand> { - using index_scorer<Wand>::index_scorer; +struct dph: public WandIndexScorer<Wand> { + using WandIndexScorer<Wand>::WandIndexScorer; - term_scorer_t term_scorer(uint64_t term_id) const override { + TermScorer term_scorer(uint64_t term_id) const override { auto s = [&, term_id](uint32_t doc, uint32_t freq) { float f = (float)freq / this->m_wdata.doc_len(doc); float norm = (1.F - f) * (1.F - f) / (freq + 1.F); diff --git a/include/pisa/scorer/index_scorer.hpp b/include/pisa/scorer/index_scorer.hpp index 93b08891..a5adaa44 100644 --- a/include/pisa/scorer/index_scorer.hpp +++ b/include/pisa/scorer/index_scorer.hpp @@ -5,26 +5,27 @@ namespace pisa { -using term_scorer_t = std::function<float(uint32_t, uint32_t)>; using TermScorer = std::function<float(uint32_t, uint32_t)>; +/** Index scorers construct scorers for terms in the index. */ class IndexScorer { public: virtual TermScorer term_scorer(std::uint64_t term_id) const = 0; }; +/** Index scorer using WAND metadata for scoring. 
*/ template <typename Wand> -struct index_scorer: IndexScorer { +struct WandIndexScorer: IndexScorer { protected: const Wand& m_wdata; public: - explicit index_scorer(const Wand& wdata) : m_wdata(wdata) {} - index_scorer(index_scorer const&) = default; - index_scorer(index_scorer&&) noexcept = default; - index_scorer& operator=(index_scorer const&) = delete; - index_scorer& operator=(index_scorer&&) noexcept = delete; - virtual ~index_scorer() = default; + explicit WandIndexScorer(const Wand& wdata) : m_wdata(wdata) {} + WandIndexScorer(WandIndexScorer const&) = default; + WandIndexScorer(WandIndexScorer&&) noexcept = default; + WandIndexScorer& operator=(WandIndexScorer const&) = delete; + WandIndexScorer& operator=(WandIndexScorer&&) noexcept = delete; + virtual ~WandIndexScorer() = default; }; } // namespace pisa diff --git a/include/pisa/scorer/pl2.hpp b/include/pisa/scorer/pl2.hpp index 91865be6..27c8e230 100644 --- a/include/pisa/scorer/pl2.hpp +++ b/include/pisa/scorer/pl2.hpp @@ -4,7 +4,6 @@ #include <cmath> -#include <algorithm> #include <cmath> #include <cstdint> @@ -17,12 +16,12 @@ namespace pisa { /// G. Amati: "Probabalistic models for information retrieval based on /// divergence from randomness." PhD Thesis, University of Glasgow, 2003. 
template <typename Wand> -struct pl2: public index_scorer<Wand> { - using index_scorer<Wand>::index_scorer; +struct pl2: public WandIndexScorer<Wand> { + using WandIndexScorer<Wand>::WandIndexScorer; - pl2(const Wand& wdata, const float c) : index_scorer<Wand>(wdata), m_c(c) {} + pl2(const Wand& wdata, const float c) : WandIndexScorer<Wand>(wdata), m_c(c) {} - term_scorer_t term_scorer(uint64_t term_id) const override { + TermScorer term_scorer(uint64_t term_id) const override { auto s = [&, term_id](uint32_t doc, uint32_t freq) { float tfn = freq * std::log2(1.F + (m_c * this->m_wdata.avg_len()) / this->m_wdata.doc_len(doc)); diff --git a/include/pisa/scorer/qld.hpp b/include/pisa/scorer/qld.hpp index dc3ea19b..88c6529d 100644 --- a/include/pisa/scorer/qld.hpp +++ b/include/pisa/scorer/qld.hpp @@ -17,12 +17,12 @@ namespace pisa { /// Language Models Applied to Ad Hoc Information Retrieval," in Proceedings of /// SIGIR, 2001. template <typename Wand> -struct qld: public index_scorer<Wand> { - using index_scorer<Wand>::index_scorer; +struct qld: public WandIndexScorer<Wand> { + using WandIndexScorer<Wand>::WandIndexScorer; - qld(const Wand& wdata, const float mu) : index_scorer<Wand>(wdata), m_mu(mu) {} + qld(const Wand& wdata, const float mu) : WandIndexScorer<Wand>(wdata), m_mu(mu) {} - term_scorer_t term_scorer(uint64_t term_id) const override { + TermScorer term_scorer(uint64_t term_id) const override { float mu = this->m_mu; float collection_len = this->m_wdata.collection_len(); float term_occurrences = this->m_wdata.term_occurrence_count(term_id); diff --git a/include/pisa/scorer/quantized.hpp b/include/pisa/scorer/quantized.hpp index 9cca61e7..12fda286 100644 --- a/include/pisa/scorer/quantized.hpp +++ b/include/pisa/scorer/quantized.hpp @@ -1,6 +1,5 @@ #pragma once -#include <algorithm> #include <cassert> #include <cstdint> #include <utility> @@ -11,10 +10,10 @@ namespace pisa { template <typename Wand> -struct quantized: public index_scorer<Wand> { - using 
index_scorer<Wand>::index_scorer; +struct quantized: public WandIndexScorer<Wand> { + using WandIndexScorer<Wand>::WandIndexScorer; - term_scorer_t term_scorer([[maybe_unused]] uint64_t term_id) const { + TermScorer term_scorer([[maybe_unused]] uint64_t term_id) const override { return []([[maybe_unused]] uint32_t doc, uint32_t freq) { return freq; }; } }; diff --git a/include/pisa/scorer/scorer.hpp b/include/pisa/scorer/scorer.hpp index 2ab63016..38e04bdd 100644 --- a/include/pisa/scorer/scorer.hpp +++ b/include/pisa/scorer/scorer.hpp @@ -23,7 +23,7 @@ struct ScorerParams { namespace pisa { namespace scorer { inline auto from_params = [](const ScorerParams& params, auto const& wdata - ) -> std::unique_ptr<index_scorer<std::decay_t<decltype(wdata)>>> { + ) -> std::unique_ptr<WandIndexScorer<std::decay_t<decltype(wdata)>>> { if (params.name == "bm25") { return std::make_unique<bm25<std::decay_t<decltype(wdata)>>>( wdata, params.bm25_b, params.bm25_k1 diff --git a/include/pisa/util/verify_collection.hpp b/include/pisa/util/verify_collection.hpp index 98597289..0fa69176 100644 --- a/include/pisa/util/verify_collection.hpp +++ b/include/pisa/util/verify_collection.hpp @@ -72,51 +72,7 @@ void verify_collection( Collection coll; auto source = MemorySource::mapped_file(std::filesystem::path(filename)); pisa::mapper::map(coll, source.data()); - size_t size = 0; - spdlog::info("Checking the written data, just to be extra safe..."); - size_t s = 0; - for (auto seq: input) { - size = seq.docs.size(); - auto e = coll[s]; - if (e.size() != size) { - spdlog::error("sequence {} has wrong length! ({} != {})", s, e.size(), size); - throw std::runtime_error("oops"); - } - auto term_scorer = quantizing_scorer.has_value() - ? 
std::make_optional<std::function<std::uint32_t(std::uint32_t, std::uint32_t)>>( - quantizing_scorer->term_scorer(s) - ) - : std::nullopt; - for (size_t i = 0; i < e.size(); ++i, e.next()) { - uint64_t docid = *(seq.docs.begin() + i); - uint64_t freq = *(seq.freqs.begin() + i); - - if (docid != e.docid()) { - spdlog::error("docid in sequence {} differs at position {}!", s, i); - spdlog::error("{} != {}", e.docid(), docid); - spdlog::error("sequence length: {}", seq.docs.size()); - throw std::runtime_error("oops"); - } - - if (!term_scorer.has_value() && freq != e.freq()) { - spdlog::error("freq in sequence {} differs at position {}!", s, i); - spdlog::error("{} != {}", e.freq(), freq); - spdlog::error("sequence length: {}", seq.docs.size()); - throw std::runtime_error("oops"); - } - - if (term_scorer.has_value()) { - if ((*term_scorer)(docid, freq) != e.freq()) { - spdlog::error("quantized score in sequence {} differs at position {}!", s, i); - spdlog::error("{} != {}", e.freq(), (*term_scorer)(docid, freq)); - spdlog::error("sequence length: {}", seq.docs.size()); - throw std::runtime_error("oops"); - } - } - } - s += 1; - } - spdlog::info("Everything is OK!"); + verify_collection(input, coll, std::move(quantizing_scorer)); } } // namespace pisa diff --git a/src/block_inverted_index.cpp b/src/block_inverted_index.cpp index 24b02c00..27203968 100644 --- a/src/block_inverted_index.cpp +++ b/src/block_inverted_index.cpp @@ -192,8 +192,6 @@ void BlockIndexBuilder::build(binary_freq_collection const& input, std::string c MemorySource::mapped_file(std::filesystem::path(index_path)), m_block_codec ); dump_stats(index.size_stats(), postings); - // TODO: only pefopt - // dump_index_specific_stats(coll, seq_type); verify_collection<binary_freq_collection, BlockInvertedIndex>( input, index, std::move(m_quantizing_scorer) ); diff --git a/src/codec/block_codec_registry.cpp b/src/codec/block_codec_registry.cpp index ab7a880a..292a23e5 100644 --- 
a/src/codec/block_codec_registry.cpp +++ b/src/codec/block_codec_registry.cpp @@ -1,8 +1,6 @@ #include "codec/block_codec_registry.hpp" -#include <algorithm> #include <array> -#include <memory> #include <string_view> #include <fmt/format.h> @@ -22,4 +20,24 @@ namespace pisa { +using BlockCodecs = BlockCodecRegistry< + InterpolativeBlockCodec, + MaskedVByteBlockCodec, + OptPForBlockCodec, + QmxBlockCodec, + SimdBpBlockCodec, + Simple16BlockCodec, + Simple8bBlockCodec, + StreamVByteBlockCodec, + VarintG8IUBlockCodec, + VarintGbBlockCodec>; + +auto get_block_codec(std::string_view name) -> BlockCodecPtr { + return BlockCodecs::get(name); +} + +constexpr auto get_block_codec_names() -> gsl::span<std::string_view const> { + return gsl::make_span<std::string_view const>(&BlockCodecs::names[0], BlockCodecs::count()); +} + } // namespace pisa diff --git a/src/codec/optpfor.cpp b/src/codec/optpfor.cpp index 445fa633..b56e17c1 100644 --- a/src/codec/optpfor.cpp +++ b/src/codec/optpfor.cpp @@ -5,6 +5,36 @@ namespace pisa { +uint32_t OptPForBlockCodec::Codec::findBestB(const uint32_t* in, uint32_t len) { + // trick to force the choice of b from a parameter + if (force_b != nullptr) { + return *force_b; + } + + // this is mostly a cut&paste from FastPFor, but we stop the + // optimization early as the b to test becomes larger than maxb + uint32_t b = 0; + uint32_t bsize = std::numeric_limits<uint32_t>::max(); + const uint32_t mb = FastPForLib::maxbits(in, in + len); + uint32_t i = 0; + while (mb > 28 + possLogs[i]) { + ++i; // some schemes such as Simple16 don't code numbers greater than 28 + } + + for (; i < possLogs.size(); i++) { + if (possLogs[i] > mb) { + break; + } + const uint32_t csize = tryB(possLogs[i], in, len); + + if (csize <= bsize) { + b = possLogs[i]; + bsize = csize; + } + } + return b; +} + void OptPForBlockCodec::encode( uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out ) const { diff --git 
a/test/docker/benchmark/Dockerfile b/test/docker/benchmark/Dockerfile index 1b8d6ed0..bc095221 100644 --- a/test/docker/benchmark/Dockerfile +++ b/test/docker/benchmark/Dockerfile @@ -14,7 +14,7 @@ RUN ./pisa/test/docker/install-cmake.sh RUN mkdir /pisa/build WORKDIR /pisa/build RUN cmake \ - "-DCMAKE_BUILD_TYPE=Debug" \ + "-DCMAKE_BUILD_TYPE=Release" \ "-DPISA_BUILD_TOOLS=ON" \ "-DPISA_ENABLE_TESTING=OFF" \ "-DPISA_ENABLE_BENCHMARKING=OFF" \ @@ -29,5 +29,6 @@ COPY ./test/docker/benchmark/evaluate.sh /pisa/build COPY ./test/docker/benchmark/bench.sh /pisa/build COPY ./test/docker/benchmark/setup.sh /pisa/build COPY ./test/docker/benchmark/expected-eval.txt /pisa/build +COPY ./test/docker/benchmark/encodings.sh /pisa/build CMD ["bash", "/pisa/test/docker/benchmark/run.sh"] diff --git a/test/docker/benchmark/bench.sh b/test/docker/benchmark/bench.sh index afaa9dbc..354e161c 100755 --- a/test/docker/benchmark/bench.sh +++ b/test/docker/benchmark/bench.sh @@ -2,13 +2,17 @@ set -e -./bin/queries \ - -e block_simdbp \ - -a block_max_wand \ - -i "$WORKDIR/inv.block_simdbp" \ - -w "$WORKDIR/inv.bm25.bmw" \ - -F lowercase -F porter2 \ - --terms "$WORKDIR/fwd.termlex" \ - -k 1000 \ - --scorer bm25 \ - -q "$WORKDIR/topics.robust2004.title" +source ./encodings.sh + +for encoding in ${ENCODINGS[@]}; do + ./bin/queries \ + -e "$encoding" \ + -a block_max_wand \ + -i "$WORKDIR/inv.$encoding" \ + -w "$WORKDIR/inv.bm25.bmw" \ + -F lowercase -F porter2 \ + --terms "$WORKDIR/fwd.termlex" \ + -k 1000 \ + --scorer bm25 \ + -q "$WORKDIR/topics.robust2004.title" +done diff --git a/test/docker/benchmark/build.sh b/test/docker/benchmark/build.sh index 32c65560..14ee6c82 100755 --- a/test/docker/benchmark/build.sh +++ b/test/docker/benchmark/build.sh @@ -2,6 +2,8 @@ set -e +source ./encodings.sh + gzip -dc $(find "$COLLECTION_PATH" -type f -name '*.*z' \ \( -path '*/disk4/fr94/[0-9]*/*' -o -path '*/disk4/ft/ft*' \ -o -path '*/disk5/fbis/fb*' -o -path '*/disk5/latimes/la*' \)) \ @@ -25,20 
+27,7 @@ gzip -dc $(find "$COLLECTION_PATH" -type f -name '*.*z' \ -o "$WORKDIR/inv.bm25.bmw" \ -s bm25 -encodings=( - block_interpolative - block_maskedvbyte - block_optpfor - block_qmx - block_simdbp - block_simple16 - block_simple8b - block_streamvbyte - block_varintg8iu - block_varintgb -) - -for encoding in ${encodings[@]}; do +for encoding in ${ENCODINGS[@]}; do ./bin/compress_inverted_index \ -e "$encoding" \ -c "$WORKDIR/inv.bp" \ diff --git a/test/docker/benchmark/encodings.sh b/test/docker/benchmark/encodings.sh new file mode 100644 index 00000000..6a20b96d --- /dev/null +++ b/test/docker/benchmark/encodings.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +ENCODINGS=( + block_interpolative + block_maskedvbyte + block_optpfor + block_qmx + block_simdbp + block_simple16 + block_simple8b + block_streamvbyte + block_varintg8iu + block_varintgb + ef + pefopt + pefuniform + single +) diff --git a/test/docker/benchmark/evaluate.sh b/test/docker/benchmark/evaluate.sh index bb39576e..dd676ec9 100755 --- a/test/docker/benchmark/evaluate.sh +++ b/test/docker/benchmark/evaluate.sh @@ -2,21 +2,23 @@ set -e -./bin/evaluate_queries \ - -e block_simdbp \ - -a block_max_wand \ - -i "$WORKDIR/inv.block_simdbp" \ - -w "$WORKDIR/inv.bm25.bmw" \ - -F lowercase -F porter2 \ - --terms "$WORKDIR/fwd.termlex" \ - --documents "$WORKDIR/fwd.bp.doclex" \ - -k 1000 \ - --scorer bm25 \ - -q "$WORKDIR/topics.robust2004.title" \ - > "$WORKDIR/results.txt" +source ./encodings.sh -trec_eval -m map -m P.30 -m ndcg_cut.20 "$WORKDIR/qrels.robust2004.txt" "$WORKDIR/results.txt" > 'eval.txt' +for encoding in ${ENCODINGS[@]}; do + ./bin/evaluate_queries \ + -e "$encoding" \ + -a block_max_wand \ + -i "$WORKDIR/inv.$encoding" \ + -w "$WORKDIR/inv.bm25.bmw" \ + -F lowercase -F porter2 \ + --terms "$WORKDIR/fwd.termlex" \ + --documents "$WORKDIR/fwd.bp.doclex" \ + -k 1000 \ + --scorer bm25 \ + -q "$WORKDIR/topics.robust2004.title" \ + > "$WORKDIR/results.txt" -cat 'eval.txt' - -diff 'eval.txt' 
expected-eval.txt + trec_eval -m map -m P.30 -m ndcg_cut.20 "$WORKDIR/qrels.robust2004.txt" "$WORKDIR/results.txt" > 'eval.txt' + cat 'eval.txt' + diff 'eval.txt' expected-eval.txt +done