Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MaxScore TAAT and blocked accumulators #14

Open
wants to merge 39 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
653d21e
First take at exhaustive TAAT
elshize Dec 20, 2018
8f0b54a
Fetch an entire block at a time for TAAT
elshize Dec 22, 2018
9f4e325
Add prefetching for TAAT
elshize Dec 22, 2018
c28e5d2
Return buffer references from posting lists instead of moved buffers.
elshize Dec 22, 2018
5d5519b
Blocked accumulator array for TAAT
elshize Dec 22, 2018
886e580
TAAT maxscore and lazy accumulator
elshize Dec 27, 2018
b0e2824
TAAT MaxScore and Blocked Accumulator
elshize Dec 28, 2018
7737d1f
Remove heap stuff
elshize Dec 28, 2018
0664cba
TAAT optimizations
elshize Dec 29, 2018
9ae6b1a
Vectorize lookup traversal.
elshize Dec 29, 2018
26a05f1
Simple but effective ranked_or with taat
amallia Jan 15, 2019
41ca198
Remove OpenMP
elshize Jan 15, 2019
200be05
Remove OpenMP
elshize Jan 15, 2019
c3400d1
Use template rather than std::function for faster processing
elshize Jan 15, 2019
87252f3
Lazy accumulator fixed
elshize Jan 15, 2019
b9dd1fd
Fix block traversal issue
elshize Jan 16, 2019
5c539b4
Removed ds2i namespace
amallia Jan 16, 2019
0c36632
Moved algos
amallia Jan 16, 2019
6e96d16
Merge with master
amallia Jan 16, 2019
f2c93f9
code cleanup
amallia Jan 16, 2019
7ca53d5
Added comment [skip ci]
amallia Jan 16, 2019
30782ac
Improved queries interface
amallia Jan 16, 2019
3649a7d
removed unused lambdas
amallia Jan 16, 2019
81db028
Removed buffers
amallia Jan 16, 2019
09fbc02
Merge branch 'master' into taat
elshize Jan 17, 2019
f1a8b64
Simplified Maxscore
amallia Jan 19, 2019
eeacec4
Merge branch 'master' into taat
amallia Jan 21, 2019
d08a063
Delete exhaustive_taat_query.hpp
amallia Jan 21, 2019
f254277
Update test_ranked_queries.cpp
amallia Jan 21, 2019
b92e93d
Update test_ranked_queries.cpp
amallia Jan 21, 2019
c8c1b35
Faster MaxScore
amallia Jan 22, 2019
e7cd360
Merge branch 'master' into taat
amallia Jan 22, 2019
e60142b
Added extra check
amallia Jan 22, 2019
a1d25d4
Merge branch 'master' into taat
elshize Jan 25, 2019
2e12eff
Merge remote-tracking branch 'origin/master' into taat
elshize Jan 25, 2019
18be6a0
Use int for 8 bits
elshize Jan 25, 2019
48a0f80
Bug fix: reset counter
elshize Jan 26, 2019
abf75fc
Merge branch 'master' into taat
amallia Jan 27, 2019
cb67ba9
Merge branch 'master' into taat
amallia Jan 29, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ list(APPEND LCOV_REMOVE_PATTERNS "'${PROJECT_SOURCE_DIR}/external/*'")


if (UNIX)

# For hardware popcount and other special instructions
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")

Expand All @@ -62,8 +61,6 @@ endif()

set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
link_libraries(Threads::Threads)


include_directories(include)
add_library(pisa INTERFACE)
Expand Down
84 changes: 84 additions & 0 deletions include/pisa/accumulator/blocked_accumulator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#pragma once

namespace pisa {

/// Dense per-document score accumulator that additionally tracks, for every
/// run of `block_size` consecutive documents, an upper bound on the scores
/// stored in that run. `aggregate()` uses those bounds to skip whole blocks
/// that cannot contribute to the top-k queue.
///
/// @tparam block_size  number of consecutive documents sharing one maximum.
template <int block_size>
struct Blocked_Accumulator {

    static_assert(block_size > 0, "must be positive");

    /// Write-through handle returned by `operator[]`; every store also keeps
    /// the maximum of the enclosing block up to date.
    struct Proxy_Element {
        std::ptrdiff_t document;
        std::vector<float> &accumulators;
        std::vector<float> &accumulators_max;

        /// Overwrites the document's score. The block maximum is only ever
        /// raised, never lowered, so it stays an upper bound.
        Proxy_Element &operator=(float score)
        {
            accumulators[document] = score;
            auto &block_max = accumulators_max[document / block_size];
            if (score > block_max) {
                block_max = score;
            }
            return *this;
        }

        /// Adds `delta` to the document's score and raises the block maximum
        /// if the new score exceeds it.
        Proxy_Element &operator+=(float delta)
        {
            accumulators[document] += delta;
            auto const &score = accumulators[document];
            auto &block_max = accumulators_max[document / block_size];
            if (score > block_max) {
                block_max = score;
            }
            return *this;
        }

        /// Reads the document's current score.
        operator float() const { return accumulators[document]; }
    };

    using reference = Proxy_Element;

    /// Number of blocks needed to cover `size` documents (rounded up).
    [[nodiscard]] constexpr static auto calc_block_count(std::size_t size) noexcept -> std::size_t
    {
        return (size + block_size - 1) / block_size;
    }

    /// Allocates zero-initialized accumulators for `size` documents.
    Blocked_Accumulator(std::size_t size)
        : m_size(size),
          m_block_count(calc_block_count(size)),
          m_accumulators(size),
          m_accumulators_max(m_block_count)
    {}

    /// Resets the accumulator for a new query.
    ///
    /// Both the scores and the block maxima are cleared: leaving maxima from
    /// a previous query in place would make `aggregate()` scan blocks that
    /// hold nothing relevant for the current query.
    void init()
    {
        std::fill(m_accumulators.begin(), m_accumulators.end(), 0.0F);
        std::fill(m_accumulators_max.begin(), m_accumulators_max.end(), 0.0F);
    }

    /// Returns a proxy through which the score of `document` can be read or
    /// written while keeping the block maximum consistent.
    [[nodiscard]] auto operator[](std::ptrdiff_t document) -> Proxy_Element
    {
        return {document, m_accumulators, m_accumulators_max};
    }

    /// Adds `score_delta` to the score of `document` and raises the maximum
    /// of its block if needed.
    void accumulate(std::ptrdiff_t const document, float score_delta)
    {
        m_accumulators[document] += score_delta;
        auto const &score = m_accumulators[document];
        auto &block_max = m_accumulators_max[document / block_size];
        if (score > block_max) {
            block_max = score;
        }
    }

    /// Offers accumulated scores to `topk`, skipping every block whose
    /// maximum would not enter the queue.
    void aggregate(topk_queue &topk)
    {
        for (std::size_t block = 0; block < m_block_count; ++block) {
            if (not topk.would_enter(m_accumulators_max[block])) {
                continue;
            }
            std::size_t const first = block * block_size;
            // The last block may be only partially filled.
            std::size_t const last = std::min(first + block_size, m_accumulators.size());
            auto doc = static_cast<uint32_t>(first);
            auto const end = static_cast<uint32_t>(last);
            for (; doc < end; ++doc) {
                topk.insert(m_accumulators[doc], doc);
            }
        }
    }

    /// Number of documents the accumulator was created for.
    [[nodiscard]] auto size() const noexcept -> std::size_t { return m_size; }

   private:
    std::size_t m_size;
    std::size_t m_block_count;
    std::vector<float> m_accumulators;
    std::vector<float> m_accumulators_max;
};

} // pisa
95 changes: 95 additions & 0 deletions include/pisa/accumulator/lazy_accumulator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#pragma once

namespace pisa {

/// Score accumulator that avoids clearing the whole array between queries.
///
/// Documents are grouped into blocks; each block packs one small counter per
/// document into a single `Descriptor` word. A slot is considered valid for
/// the current query only when its counter equals the accumulator-wide
/// counter; otherwise it is zeroed on first access. The accumulator-wide
/// counter is advanced by `aggregate()`, and a full wipe happens in `init()`
/// only when that counter is 0 (i.e. initially and after it wraps around).
///
/// @tparam counter_bit_size  width in bits of a per-document counter.
/// @tparam Descriptor        unsigned integer type holding a block's counters.
template <int counter_bit_size, typename Descriptor = std::uint64_t>
struct Lazy_Accumulator {
    using reference = float &;

    static_assert(std::is_integral_v<Descriptor> && std::is_unsigned_v<Descriptor>,
                  "must be unsigned number");
    static_assert(counter_bit_size > 0 && counter_bit_size < 32,
                  "counter width must be in [1, 31] bits");
    constexpr static auto descriptor_size_in_bits = sizeof(Descriptor) * 8;
    static_assert(static_cast<std::size_t>(counter_bit_size) <= descriptor_size_in_bits,
                  "counter must fit in the descriptor");
    constexpr static auto counters_in_descriptor = descriptor_size_in_bits / counter_bit_size;
    constexpr static auto cycle = (1u << counter_bit_size);
    constexpr static Descriptor mask = (1u << counter_bit_size) - 1;

    /// One descriptor word plus the scores of the documents it describes.
    struct Block {
        Descriptor descriptor{};
        std::array<float, counters_in_descriptor> accumulators{};

        /// Extracts the counter stored for slot `pos` of this block.
        [[nodiscard]] auto counter(int pos) const noexcept -> int
        {
            return (descriptor >> (pos * counter_bit_size)) & mask;
        }

        /// Stores `counter` for slot `pos` and zeroes the slot's score.
        void reset_counter(int pos, int counter)
        {
            auto const shift = pos * counter_bit_size;
            descriptor &= ~(mask << shift);
            // Widen before shifting: the shift amount may exceed the width of int.
            descriptor |= static_cast<Descriptor>(counter) << shift;
            accumulators[pos] = 0;
        }
    };

    /// Allocates enough blocks to cover `size` documents (rounded up).
    Lazy_Accumulator(std::size_t size)
        : m_size(size), m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor)
    {}

    /// Prepares for a new query. Storage is wiped only when the query counter
    /// is 0; for any other counter value stale slots are reset lazily on access.
    void init()
    {
        if (m_counter == 0) {
            // Iterator-based fill is well defined for an empty accumulator,
            // unlike taking the address of front()/back().
            std::fill(m_accumulators.begin(), m_accumulators.end(), Block{});
        }
    }

    /// Returns a reference to the score of `document`, zeroing it first if it
    /// was last written under a different query counter.
    float &operator[](std::ptrdiff_t const document)
    {
        auto const block = document / counters_in_descriptor;
        auto const pos_in_block = static_cast<int>(document % counters_in_descriptor);
        auto &target = m_accumulators[block];
        if (target.counter(pos_in_block) != m_counter) {
            // Shares reset_counter() with accumulate() so the counter is
            // widened to Descriptor before being shifted into place.
            target.reset_counter(pos_in_block, m_counter);
        }
        return target.accumulators[pos_in_block];
    }

    /// Adds `score` to the score of `document`, resetting a stale slot first.
    void accumulate(std::ptrdiff_t const document, float score)
    {
        (*this)[document] += score;
    }

    /// Offers every slot written during the current query to `topk`, then
    /// advances the query counter (modulo `cycle`).
    void aggregate(topk_queue &topk)
    {
        std::uint64_t docid = 0U;
        for (auto const &block : m_accumulators) {
            // `docid < m_size` skips the padding slots of the last block so
            // that identifiers of non-existent documents are never reported.
            for (int pos = 0;
                 pos < static_cast<int>(counters_in_descriptor) && docid < m_size;
                 ++pos, ++docid) {
                if (block.counter(pos) == m_counter) {
                    topk.insert(block.accumulators[pos], docid);
                }
            }
        }
        m_counter = static_cast<int>((static_cast<unsigned>(m_counter) + 1U) % cycle);
    }

    /// Number of documents the accumulator was created for.
    [[nodiscard]] auto size() const noexcept -> std::size_t { return m_size; }
    /// Direct access to the underlying blocks.
    [[nodiscard]] auto blocks() noexcept -> std::vector<Block> & { return m_accumulators; }
    /// Current value of the query counter.
    [[nodiscard]] auto counter() const noexcept -> int { return m_counter; }

   private:
    std::size_t m_size;
    std::vector<Block> m_accumulators;
    int m_counter{};
};

}
15 changes: 15 additions & 0 deletions include/pisa/accumulator/simple_accumulator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#pragma once

namespace pisa {

/// Plain accumulator: one float score per document, stored directly in the
/// inherited vector.
struct Simple_Accumulator : public std::vector<float> {
    /// Creates `size` value-initialized score slots.
    Simple_Accumulator(std::ptrdiff_t size) : std::vector<float>(size) {}

    /// Sets every score back to zero.
    void init()
    {
        for (auto &slot : *this) {
            slot = 0.0;
        }
    }

    /// Adds `score` to the slot of document `doc`.
    void accumulate(uint32_t doc, float score)
    {
        auto &slot = (*this)[doc];
        slot += score;
    }

    /// Offers every (score, docid) pair to `topk`, docids ascending from 0.
    void aggregate(topk_queue &topk)
    {
        uint64_t docid = 0u;
        for (auto it = begin(); it != end(); ++it) {
            topk.insert(*it, docid);
            ++docid;
        }
    }
};

}
21 changes: 21 additions & 0 deletions include/pisa/block_posting_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ namespace pisa {

class document_enumerator {
public:
using enumerator_category = pisa::block_enumerator_tag;

document_enumerator(uint8_t const* data, uint64_t universe,
size_t term_id = 0)
: m_n(0) // just to silence warnings
Expand Down Expand Up @@ -156,6 +158,25 @@ namespace pisa {
}
}

// Returns a read-only reference to the enumerator's internal document
// buffer (m_docs_buf); no copy is made.
// NOTE(review): presumably this holds the decoded docids of the current
// block -- confirm against the block decoding code.
// TODO(michal): I recommend using some view, like gsl::span or something
// instead of a reference to a vector.
[[nodiscard]] auto document_buffer() -> std::vector<uint32_t> const & {
return m_docs_buf;
}

// Gives read-only access to m_freqs_buf. If m_freqs_decoded is not set yet,
// decode_freqs_block() is invoked first.
[[nodiscard]] auto frequency_buffer() -> std::vector<uint32_t> const &
{
    if (m_freqs_decoded) {
        return m_freqs_buf;
    }
    decode_freqs_block();
    return m_freqs_buf;
}

// Moves the in-block position to the last index of the current block
// (m_cur_block_size - 1) and then calls next().
// NOTE(review): presumably next() from the last position advances into the
// following block -- confirm against next().
void next_block()
{
m_pos_in_block = m_cur_block_size - 1;
next();
}

uint64_t docid() const
{
return m_cur_docid;
Expand Down
1 change: 1 addition & 0 deletions include/pisa/freq_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ namespace pisa {

class document_enumerator {
public:
using enumerator_category = pisa::input_enumerator_tag;
void reset()
{
m_cur_pos = 0;
Expand Down
14 changes: 9 additions & 5 deletions include/pisa/query/algorithm/and_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

namespace pisa {

template <bool with_freqs>
template <typename Index, bool with_freqs>
struct and_query {

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec terms) const {
and_query(Index const &index) : m_index(index) {}

uint64_t operator()(term_id_vec terms) const {
if (terms.empty())
return 0;
remove_duplicate_terms(terms);
Expand All @@ -16,7 +17,7 @@ struct and_query {
enums.reserve(terms.size());

for (auto term : terms) {
enums.push_back(index[term]);
enums.push_back(m_index[term]);
}

// sort by increasing frequency
Expand All @@ -27,7 +28,7 @@ struct and_query {
uint64_t results = 0;
uint64_t candidate = enums[0].docid();
size_t i = 1;
while (candidate < index.num_docs()) {
while (candidate < m_index.num_docs()) {
for (; i < enums.size(); ++i) {
enums[i].next_geq(candidate);
if (enums[i].docid() != candidate) {
Expand All @@ -52,6 +53,9 @@ struct and_query {
}
return results;
}

private:
Index const &m_index;
};

} // namespace pisa
17 changes: 9 additions & 8 deletions include/pisa/query/algorithm/block_max_maxscore_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct block_max_maxscore_query {

typedef bm25 scorer_type;

block_max_maxscore_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
block_max_maxscore_query(Index const &index, WandType const &wdata, uint64_t k)
: m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec const &terms) {
uint64_t operator()(term_id_vec const &terms) {
m_topk.clear();
if (terms.empty())
return 0;

auto query_term_freqs = query_freqs(terms);

uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
typedef typename WandType::wand_data_enumerator wdata_enum;

Expand All @@ -32,7 +32,7 @@ struct block_max_maxscore_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto w_enum = m_wdata->getenum(term.first);
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);
auto max_weight = q_weight * m_wdata->max_term_weight(term.first);
Expand Down Expand Up @@ -66,10 +66,10 @@ struct block_max_maxscore_query {
})
->docs_enum.docid();

while (non_essential_lists < ordered_enums.size() && cur_doc < index.num_docs()) {
while (non_essential_lists < ordered_enums.size() && cur_doc < m_index.num_docs()) {
float score = 0;
float norm_len = m_wdata->norm_len(cur_doc);
uint64_t next_doc = index.num_docs();
uint64_t next_doc = m_index.num_docs();
for (size_t i = non_essential_lists; i < ordered_enums.size(); ++i) {
if (ordered_enums[i]->docs_enum.docid() == cur_doc) {
score +=
Expand Down Expand Up @@ -129,6 +129,7 @@ struct block_max_maxscore_query {
std::vector<std::pair<float, uint64_t>> const &topk() const { return m_topk.topk(); }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
Loading