Skip to content

Commit

Permalink
Add option to perform index compression in memory
Browse files Browse the repository at this point in the history
  • Loading branch information
Agustín González committed Apr 13, 2024
1 parent ce97f7b commit f8b6308
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 27 deletions.
3 changes: 2 additions & 1 deletion include/pisa/compress.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ void compress(
std::string const& output_filename,
ScorerParams const& scorer_params,
std::optional<Size> quantization_bits,
bool check
bool check,
bool in_memory
);

} // namespace pisa
51 changes: 29 additions & 22 deletions src/compress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,32 +113,38 @@ void compress_index(
std::string const& seq_type,
std::optional<std::string> const& wand_data_filename,
ScorerParams const& scorer_params,
std::optional<Size> quantization_bits
std::optional<Size> quantization_bits,
bool in_memory
) {
std::optional<QuantizingScorer<WandType>> quantizing_scorer{};
if constexpr (std::is_same_v<typename CollectionType::index_layout_tag, BlockIndexTag>) {
WandType wdata;
mio::mmap_source wdata_source;
if (quantization_bits.has_value()) {
ensure(wand_data_filename.has_value())
.or_panic("Bug: Asked for quantized but no wand data");
std::error_code error;
wdata_source.map(*wand_data_filename, error);
if (error) {
spdlog::error("error mapping file: {}, exiting...", error.message());
std::abort();

// Performs the compression using an intermediate buffer.
if (!in_memory) {
if constexpr (std::is_same_v<typename CollectionType::index_layout_tag, BlockIndexTag>) {
WandType wdata;
mio::mmap_source wdata_source;
if (quantization_bits.has_value()) {
ensure(wand_data_filename.has_value())
.or_panic("Bug: Asked for quantized but no wand data");
std::error_code error;
wdata_source.map(*wand_data_filename, error);
if (error) {
spdlog::error("error mapping file: {}, exiting...", error.message());
std::abort();
}
mapper::map(wdata, wdata_source, mapper::map_flags::warmup);
auto scorer = scorer::from_params(scorer_params, wdata);
LinearQuantizer quantizer(wdata.index_max_term_weight(), quantization_bits->as_int());
quantizing_scorer = QuantizingScorer(std::move(scorer), quantizer);
}
mapper::map(wdata, wdata_source, mapper::map_flags::warmup);
auto scorer = scorer::from_params(scorer_params, wdata);
LinearQuantizer quantizer(wdata.index_max_term_weight(), quantization_bits->as_int());
quantizing_scorer = QuantizingScorer(std::move(scorer), quantizer);
compress_index_streaming<CollectionType, WandType>(
input, params, *output_filename, std::move(quantizing_scorer), check
);
return;
}
compress_index_streaming<CollectionType, WandType>(
input, params, *output_filename, std::move(quantizing_scorer), check
);
return;
}

// Performs the compression in memory.
spdlog::info("Processing {} documents", input.num_docs());
double tick = get_time_usecs();

Expand Down Expand Up @@ -217,7 +223,8 @@ void compress(
std::string const& output_filename,
ScorerParams const& scorer_params,
std::optional<Size> quantization_bits,
bool check
bool check,
bool in_memory
) {
binary_freq_collection input(input_basename.c_str());
global_parameters params;
Expand All @@ -227,7 +234,7 @@ void compress(
} \
else if (index_encoding == BOOST_PP_STRINGIZE(T)) { \
compress_index<pisa::BOOST_PP_CAT(T, _index), wand_data<wand_data_raw>>( \
input, params, output_filename, check, index_encoding, wand_data_filename, scorer_params, quantization_bits \
input, params, output_filename, check, index_encoding, wand_data_filename, scorer_params, quantization_bits, in_memory \
); \
/**/
BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES);
Expand Down
6 changes: 4 additions & 2 deletions test/test_compress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ TEST_CASE("Compress index", "[index][compress]") {
(tmp.path() / encoding).string(),
ScorerParams(""), // no scorer
std::nullopt, // no quantization
true // check=true
true, // check=true
false // in-memory=false
);
}

Expand Down Expand Up @@ -82,6 +83,7 @@ TEST_CASE("Compress quantized index", "[index][compress]") {
(tmp.path() / encoding).string(),
scorer_params,
pisa::Size(8),
true // check=true
true, // check=true
false // in-memory=false
);
}
3 changes: 3 additions & 0 deletions tools/app.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,11 +270,13 @@ namespace arg {
->required();
app->add_option("-o,--output", m_output, "Output inverted index")->required();
app->add_flag("--check", m_check, "Check the correctness of the index");
app->add_flag("--in-memory", m_in_memory, "Compress the index in memory, without using an intermediate buffer");
}

[[nodiscard]] auto input_basename() const -> std::string { return m_input_basename; }
[[nodiscard]] auto output() const -> std::string { return m_output; }
[[nodiscard]] auto check() const -> bool { return m_check; }
[[nodiscard]] auto in_memory() const -> bool { return m_in_memory; }

/// Transform paths for `shard`.
void apply_shard(Shard_Id shard) {
Expand All @@ -286,6 +288,7 @@ namespace arg {
std::string m_input_basename{};
std::string m_output{};
bool m_check = false;
bool m_in_memory = false;
};

struct CreateWandData {
Expand Down
3 changes: 2 additions & 1 deletion tools/compress_inverted_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ int main(int argc, char** argv) {
args.output(),
args.scorer_params(),
args.quantization_bits(),
args.check()
args.check(),
args.in_memory()
);
}
3 changes: 2 additions & 1 deletion tools/shards.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ int main(int argc, char** argv) {
shard_args.output(),
shard_args.scorer_params(),
shard_args.quantization_bits(),
shard_args.check()
shard_args.check(),
shard_args.in_memory()
);
}
return 0;
Expand Down

0 comments on commit f8b6308

Please sign in to comment.