Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Query container usage #382

Draft
wants to merge 27 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
12d5fb6
Query container
elshize Apr 26, 2020
1d3aa88
Query container parsing
elshize Apr 26, 2020
ce2b720
Merge branch 'master' into query-container
elshize Apr 27, 2020
cdc17f3
CLI test
elshize Apr 27, 2020
98fe8b1
Merge branch 'query-container' of github.com:pisa-engine/pisa into qu…
elshize Apr 27, 2020
b0e5d1a
Fix .travis.yml syntax
elshize Apr 27, 2020
6e2ab62
Fix .travis.yml syntax
elshize Apr 27, 2020
2cce2cd
Fix when CLI tests are executed
elshize Apr 27, 2020
982d316
Merge branch 'master' into query-container
elshize Apr 28, 2020
1838258
Refactor out common code from tool
elshize Apr 28, 2020
3ba4588
Merge branch 'master' into query-container
elshize Apr 29, 2020
7107f65
Small refactoring and term resolver tests
elshize May 1, 2020
ede9c98
Fix tool description
elshize May 1, 2020
b8f625c
Multiple thresholds per query
elshize May 3, 2020
78cf15c
Return program with 1 if fails
elshize May 3, 2020
bee2fc3
Partial query container usage
elshize May 3, 2020
0a94dca
Replace Query with QueryContainer
elshize May 6, 2020
3e4a58c
Add evaluate_queries CLI test
elshize May 6, 2020
5a8fa5e
Add missing include
elshize May 6, 2020
11593e6
Merge branch 'master' into query-container-usage
elshize Jun 2, 2020
68e7055
Merge branch 'master' into query-container-usage
elshize Jun 5, 2020
e1c0000
Fix merge issues
elshize Jun 5, 2020
65ae025
Fix formatting
elshize Jun 5, 2020
a9a2359
Fix formatting
elshize Jun 5, 2020
4189626
Fix formatting
elshize Jun 5, 2020
c35c5a0
Fix formatting
elshize Jun 5, 2020
d83abe7
Add missing header
elshize Jun 6, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,19 @@ matrix:
apt:
sources: *all_sources
packages: ['clang-6.0', 'libstdc++-7-dev']
env: MATRIX_EVAL="CC=clang-6.0 && CXX=clang++-6.0 && COVERAGE=Off"
env: MATRIX_EVAL="CC=clang-6.0 && CXX=clang++-6.0 && COVERAGE=Off && TEST_CLI=On"

# Install dependencies
before_install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
brew install ccache;
export PATH="/usr/local/opt/ccache/libexec:$PATH";
fi
- if [[ "$TEST_CLI" == "On" ]]; then
git clone https://github.com/sstephenson/bats.git;
cd bats;
sudo ./install.sh /usr/local;
fi
- eval "${MATRIX_EVAL}"

script:
Expand All @@ -55,6 +60,9 @@ script:
make -j2;
if [[ "$TIDY" != "On" ]]; then
CTEST_OUTPUT_ON_FAILURE=TRUE ctest -j2;
if [[ "$TEST_CLI" == "On" ]]; then
bash ../test/cli/run.sh;
fi
fi
fi
- if [[ "$CLANG_FORMAT" == "On" ]]; then
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ target_link_libraries(pisa PUBLIC # TODO(michal): are there any of these we can
spdlog
fmt::fmt
range-v3
nlohmann_json::nlohmann_json
)
target_include_directories(pisa PUBLIC external)

Expand Down
2 changes: 1 addition & 1 deletion include/pisa/binary_collection.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class base_binary_collection {

auto const& operator*() const { return m_cur_seq; }

auto const* operator->() const { return &m_cur_seq; }
auto const* operator-> () const { return &m_cur_seq; }

base_iterator& operator++()
{
Expand Down
34 changes: 18 additions & 16 deletions include/pisa/cursor/block_max_scored_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include <vector>

#include "cursor/max_scored_cursor.hpp"
#include "query/queries.hpp"
#include "query.hpp"
#include "scorer/index_scorer.hpp"
#include "wand_data.hpp"

Expand Down Expand Up @@ -42,23 +42,25 @@ class BlockMaxScoredCursor: public MaxScoredCursor<Cursor> {

template <typename Index, typename WandType, typename Scorer>
[[nodiscard]] auto make_block_max_scored_cursors(
Index const& index, WandType const& wdata, Scorer const& scorer, Query query)
Index const& index, WandType const& wdata, Scorer const& scorer, QueryRequest query)
{
auto terms = query.terms;
auto query_term_freqs = query_freqs(terms);

std::vector<BlockMaxScoredCursor<typename Index::document_enumerator, WandType>> cursors;
cursors.reserve(query_term_freqs.size());
using cursor_type = BlockMaxScoredCursor<typename Index::document_enumerator, WandType>;
auto term_ids = query.term_ids();
auto term_weights = query.term_weights();
std::vector<cursor_type> cursors;
cursors.reserve(term_ids.size());
std::transform(
query_term_freqs.begin(), query_term_freqs.end(), std::back_inserter(cursors), [&](auto&& term) {
float weight = term.second;
auto max_weight = weight * wdata.max_term_weight(term.first);
return BlockMaxScoredCursor<typename Index::document_enumerator, WandType>(
std::move(index[term.first]),
scorer.term_scorer(term.first),
weight,
max_weight,
wdata.getenum(term.first));
term_ids.begin(),
term_ids.end(),
term_weights.begin(),
std::back_inserter(cursors),
[&](auto term_id, auto weight) {
auto max_weight = weight * wdata.max_term_weight(term_id);
return cursor_type{index[term_id],
scorer.term_scorer(term_id),
weight,
max_weight,
wdata.getenum(term_id)};
});
return cursors;
}
Expand Down
18 changes: 8 additions & 10 deletions include/pisa/cursor/cursor.hpp
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
#pragma once

#include "query/queries.hpp"
#include <vector>

#include "query.hpp"

namespace pisa {

template <typename Index>
[[nodiscard]] auto make_cursors(Index const& index, Query query)
[[nodiscard]] auto make_cursors(Index const& index, QueryRequest query)
{
auto terms = query.terms;
remove_duplicate_terms(terms);
using cursor = typename Index::document_enumerator;

std::vector<cursor> cursors;
cursors.reserve(terms.size());
std::transform(terms.begin(), terms.end(), std::back_inserter(cursors), [&](auto&& term) {
return index[term];
auto term_ids = query.term_ids();
std::vector<typename Index::document_enumerator> cursors;
cursors.reserve(term_ids.size());
std::transform(term_ids.begin(), term_ids.end(), std::back_inserter(cursors), [&](auto&& term_id) {
return index[term_id];
});

return cursors;
Expand Down
27 changes: 15 additions & 12 deletions include/pisa/cursor/max_scored_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <vector>

#include "cursor/scored_cursor.hpp"
#include "query.hpp"
#include "query/queries.hpp"
#include "wand_data.hpp"

Expand Down Expand Up @@ -31,20 +32,22 @@ class MaxScoredCursor: public ScoredCursor<Cursor> {
};

template <typename Index, typename WandType, typename Scorer>
[[nodiscard]] auto
make_max_scored_cursors(Index const& index, WandType const& wdata, Scorer const& scorer, Query query)
[[nodiscard]] auto make_max_scored_cursors(
Index const& index, WandType const& wdata, Scorer const& scorer, QueryRequest query)
{
auto terms = query.terms;
auto query_term_freqs = query_freqs(terms);

std::vector<MaxScoredCursor<typename Index::document_enumerator>> cursors;
cursors.reserve(query_term_freqs.size());
using cursor_type = MaxScoredCursor<typename Index::document_enumerator>;
auto term_ids = query.term_ids();
auto term_weights = query.term_weights();
std::vector<cursor_type> cursors;
cursors.reserve(term_ids.size());
std::transform(
query_term_freqs.begin(), query_term_freqs.end(), std::back_inserter(cursors), [&](auto&& term) {
float query_weight = term.second;
auto max_weight = query_weight * wdata.max_term_weight(term.first);
return MaxScoredCursor<typename Index::document_enumerator>(
index[term.first], scorer.term_scorer(term.first), query_weight, max_weight);
term_ids.begin(),
term_ids.end(),
term_weights.begin(),
std::back_inserter(cursors),
[&](auto term_id, auto weight) {
auto max_weight = weight * wdata.max_term_weight(term_id);
return cursor_type{index[term_id], scorer.term_scorer(term_id), weight, max_weight};
});
return cursors;
}
Expand Down
23 changes: 13 additions & 10 deletions include/pisa/cursor/scored_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

#include <vector>

#include "query/queries.hpp"
#include "query.hpp"
#include "scorer/index_scorer.hpp"
#include "wand_data.hpp"

Expand Down Expand Up @@ -45,17 +45,20 @@ class ScoredCursor {
};

template <typename Index, typename Scorer>
[[nodiscard]] auto make_scored_cursors(Index const& index, Scorer const& scorer, Query query)
[[nodiscard]] auto make_scored_cursors(Index const& index, Scorer const& scorer, QueryRequest query)
{
auto terms = query.terms;
auto query_term_freqs = query_freqs(terms);

std::vector<ScoredCursor<typename Index::document_enumerator>> cursors;
cursors.reserve(query_term_freqs.size());
using cursor_type = ScoredCursor<typename Index::document_enumerator>;
auto term_ids = query.term_ids();
auto term_weights = query.term_weights();
std::vector<cursor_type> cursors;
cursors.reserve(term_ids.size());
std::transform(
query_term_freqs.begin(), query_term_freqs.end(), std::back_inserter(cursors), [&](auto&& term) {
return ScoredCursor<typename Index::document_enumerator>(
index[term.first], scorer.term_scorer(term.first), term.second);
term_ids.begin(),
term_ids.end(),
term_weights.begin(),
std::back_inserter(cursors),
[&](auto term_id, auto weight) {
return cursor_type{index[term_id], scorer.term_scorer(term_id), weight};
});
return cursors;
}
Expand Down
46 changes: 24 additions & 22 deletions include/pisa/intersection.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
#include <variant>
#include <vector>

#include <gsl/span>

#include "query.hpp"
#include "query/algorithm/and_query.hpp"
#include "query/queries.hpp"
#include "scorer/scorer.hpp"

namespace pisa {
Expand All @@ -23,22 +25,18 @@ namespace intersection {
using Mask = std::bitset<MAX_QUERY_LEN_EXP>;

/// Returns a filtered copy of `query` containing only terms indicated by ones in the bit mask.
[[nodiscard]] inline auto filter(Query const& query, Mask mask) -> Query
[[nodiscard]] inline auto filter(QueryContainer const& query, Mask mask) -> QueryContainer
{
if (query.terms.size() > MAX_QUERY_LEN) {
throw std::invalid_argument("Queries can be at most 2^32 terms long");
}
std::vector<std::uint32_t> terms;
std::vector<float> weights;
for (std::size_t bitpos = 0; bitpos < query.terms.size(); ++bitpos) {
if (((1U << bitpos) & mask.to_ulong()) > 0) {
terms.push_back(query.terms.at(bitpos));
if (bitpos < query.term_weights.size()) {
weights.push_back(query.term_weights[bitpos]);
}
std::vector<std::size_t> positions;
for (std::size_t bitpos = 0; mask.any(); ++bitpos) {
if (mask.test(bitpos)) {
positions.push_back(bitpos);
mask.reset(bitpos);
}
}
return Query{query.id, terms, weights};
QueryContainer filtered_query(query);
filtered_query.filter_terms(positions);
return filtered_query;
}
} // namespace intersection

Expand All @@ -53,19 +51,23 @@ struct Intersection {
inline static auto compute(
Index const& index,
Wand const& wand,
Query const& query,
QueryContainer const& query,
std::optional<intersection::Mask> term_mask = std::nullopt) -> Intersection;
};

template <typename Index, typename Wand>
inline auto Intersection::compute(
Index const& index, Wand const& wand, Query const& query, std::optional<intersection::Mask> term_mask)
-> Intersection
Index const& index,
Wand const& wand,
QueryContainer const& query,
std::optional<intersection::Mask> term_mask) -> Intersection
{
auto filtered_query = term_mask ? intersection::filter(query, *term_mask) : query;
scored_and_query retrieve{};
auto scorer = scorer::from_params(ScorerParams("bm25"), wand);
auto results = retrieve(make_scored_cursors(index, *scorer, filtered_query), index.num_docs());
auto results = retrieve(
make_scored_cursors(index, *scorer, filtered_query.query(query::unlimited)),
index.num_docs());
auto max_element = [&](auto const& vec) -> float {
auto order = [](auto const& lhs, auto const& rhs) { return lhs.second < rhs.second; };
if (auto pos = std::max_element(results.begin(), results.end(), order); pos != results.end()) {
Expand All @@ -78,14 +80,14 @@ inline auto Intersection::compute(
}

/// Do `func` for all intersections in a query that have a given maximum number of terms.
/// `Fn` takes `Query` and `Mask`.
/// `Fn` takes `QueryContainer` and `Mask`.
template <typename Fn>
auto for_all_subsets(Query const& query, std::optional<std::uint8_t> max_term_count, Fn func)
auto for_all_subsets(QueryContainer const& query, std::optional<int> max_term_count, Fn func)
{
auto subset_count = 1U << query.terms.size();
auto subset_count = 1U << query.term_ids()->size();
for (auto subset = 1U; subset < subset_count; ++subset) {
auto mask = intersection::Mask(subset);
if (!max_term_count || mask.count() <= *max_term_count) {
if (!max_term_count || (mask.count() <= *max_term_count)) {
func(query, mask);
}
}
Expand Down
Loading