Skip to content

Commit

Permalink
Replace StellarIndex
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Jul 11, 2024
1 parent bb362e0 commit 2a71184
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 35 deletions.
138 changes: 138 additions & 0 deletions include/dream_stellar/stellar_index.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// ==========================================================================
// STELLAR - SwifT Exact LocaL AligneR
// http://www.seqan.de/projects/stellar/
// ==========================================================================
// Copyright (C) 2010-2012 by Birte Kehr
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your options) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// ==========================================================================
// Author: Birte Kehr <[email protected]>
// ==========================================================================
#pragma once

#include <seqan/index.h>

#include <span>

#include <utilities/alphabet_wrapper/matcher/stellar_matcher.hpp>
#include <stellar/options/index_options.hpp>

namespace dream_stellar
{
using namespace seqan2;
using namespace stellar;

template <typename TAlphabet, typename TString = String<TAlphabet>, typename TInfixSegment = seqan2::Segment<TString const, seqan2::InfixSegment>>
using StellarQGramStringSet = StringSet<TInfixSegment, Owner<> >;

template <typename TAlphabet>
using StellarQGramIndex = Index<StellarQGramStringSet<TAlphabet> const, IndexQGram<SimpleShape, OpenAddressing> >;

template <typename TAlphabet>
using StellarSwiftPattern = Pattern<StellarQGramIndex<TAlphabet>, Swift<SwiftLocal> >;

template <typename TAlphabet>
using StellarSwiftFinder = Finder<Segment<String<TAlphabet> const, InfixSegment> const, Swift<SwiftLocal> >;

template <typename TAlphabet>
struct StellarIndex
{
using TSequence = seqan2::String<TAlphabet>;
using TInfixSegment = seqan2::Segment<seqan2::String<TAlphabet> const, seqan2::InfixSegment>;
using TQGramStringSet = StellarQGramStringSet<TAlphabet>;

template <typename TSpec>
StellarIndex(StringSet<TSequence, TSpec> const & queries, IndexOptions const & options)
: StellarIndex{convertImplStringSet(queries), options}
{}

StellarIndex(StringSet<TInfixSegment> const & queries, IndexOptions const & options)
: StellarIndex{convertSegmentStringSet(queries), options}
{}

StellarIndex(std::span<TInfixSegment> const & queries, IndexOptions const & options)
: StellarIndex{convertImplSpan(queries), options}
{}

StellarIndex() = delete;
StellarIndex(StellarIndex &&) = delete;
StellarIndex(StellarIndex const &) = delete;
StellarIndex & operator=(StellarIndex &&) = delete;
StellarIndex & operator=(StellarIndex const &) = delete;

void construct()
{
indexRequire(qgramIndex, QGramSADir());
}

StellarSwiftPattern<TAlphabet> createSwiftPattern()
{
return {qgramIndex};
}

static StellarQGramIndex<TAlphabet> & qgramIndexFromPattern(StellarSwiftPattern<TAlphabet> & pattern)
{
return host(pattern);
}

static TQGramStringSet const & sequencesFromPattern(StellarSwiftPattern<TAlphabet> & pattern)
{
return sequencesFromQGramIndex(qgramIndexFromPattern(pattern));
}

static TQGramStringSet const & sequencesFromQGramIndex(StellarQGramIndex<TAlphabet> & index)
{
return indexText(index);
}

TQGramStringSet const dependentQueries;
StellarQGramIndex<TAlphabet> qgramIndex;

private:
template <typename TOtherQGramStringSet, typename = std::enable_if_t<std::is_same_v<TOtherQGramStringSet, TQGramStringSet>>>
StellarIndex(TOtherQGramStringSet && queries, IndexOptions const & options)
: dependentQueries{std::forward<TOtherQGramStringSet>(queries)}, qgramIndex{dependentQueries}
{
resize(indexShape(qgramIndex), options.qGram);
cargo(qgramIndex).abundanceCut = options.qgramAbundanceCut;
}

template <typename TSpec>
static TQGramStringSet convertImplStringSet(StringSet<TSequence, TSpec> const & queries)
{
TQGramStringSet dependentQueries;
for (TSequence const & query: queries)
seqan2::appendValue(dependentQueries, seqan2::infix(query, 0, seqan2::length(query)));

return dependentQueries;
}

static TQGramStringSet convertSegmentStringSet(StringSet<TInfixSegment> const & queries)
{
TQGramStringSet dependentQueries = queries;
return dependentQueries;
}

static TQGramStringSet convertImplSpan(std::span<TInfixSegment> const & queries)
{
TQGramStringSet dependentQueries;
for (TInfixSegment const & query: queries)
seqan2::appendValue(dependentQueries, query);

return dependentQueries;
}
};

} // namespace dream_stellar
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@

#pragma once

#include <dream_stellar/stellar_index.hpp>
#include <seqan/basic/alphabet_simple_type.h>
#include <utilities/alphabet_wrapper/seqan/container_adapter.hpp>

#include <seqan/index.h>

#include <utilities/alphabet_wrapper/matcher/seqan_pattern_base.hpp>

namespace jst::contrib
Expand Down Expand Up @@ -167,6 +166,8 @@ namespace jst::contrib

} while (true);
}

friend struct dream_stellar::StellarIndex<seqan2::Dna>;
};

/////////////////////////////////////////////////////////////////////////////////
Expand Down
59 changes: 31 additions & 28 deletions include/valik/search/iterate_queries.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <utilities/alphabet_wrapper/seqan/alphabet.hpp>
#include <valik/search/producer_threads_parallel.hpp>
#include <valik/search/search_time_statistics.hpp>
#include <valik/shared.hpp>

#include <stellar/utils/stellar_app_runtime.hpp>
#include <stellar/io/import_sequence.hpp>
Expand Down Expand Up @@ -42,51 +43,53 @@ void iterate_distributed_queries(search_arguments const & arguments,
}
}

struct adaptor_traits : seqan3::sequence_file_input_default_traits_dna
{
using sequence_alphabet = seqan2::alphabet_adaptor<seqan3::dna4>; // instead of dna5
using sequence_legal_alphabet = sequence_alphabet;
};

/**
* @brief Function that creates a query record from each query sequence and sends it to Stellar search.
*
* @param ref_seg_count Number of reference segments i.e the distribution granularity.
* @param arguments Command line arguments.
* @param queue Shopping cart queue for sending queries over to Stellar search.
*/
template <typename sequence_t>
template <typename TSequence>
void iterate_all_queries(size_t const ref_seg_count,
search_arguments const & arguments,
cart_queue<shared_query_record<sequence_t>> & queue)
{
/*
std::vector<sequence_type> seq_vec{};
for (auto & seq : rec_vec | std::views::transform([](record_type record) { return record.sequence(); }))
{
seq_vec.emplace_back(seq);
}
jst::contrib::stellar_matcher<sequence_type> matcher(seq_vec, (double) arguments.error_rate, (unsigned) arguments.minLength);
*/

std::vector<shared_query_record<sequence_t>> query_records{};
cart_queue<shared_query_record<TSequence>> & queue)
{
using TId = seqan2::CharString;
std::vector<shared_query_record<TSequence>> query_records{};

constexpr uint64_t chunk_size = (1ULL << 20) * 10;

seqan3::sequence_file_input<adaptor_traits> fin{std::istringstream{arguments.query_file}, seqan3::format_fasta{}};
using record_type = typename decltype(fin)::record_type;
using sequence_type = std::remove_cvref_t<decltype(std::declval<record_type>().sequence())>;
seqan2::SeqFileIn inSeqs;
if (!open(inSeqs, arguments.query_file.c_str()))
{
throw std::runtime_error("Failed to open " + arguments.query_file.string() + " file.");
}

std::set<TId> uniqueIds; // set of short IDs (cut at first whitespace)
bool idsUnique = true;

size_t seqCount{0};
std::vector<record_type> rec_vec{};
for (auto & record : fin)
{
rec_vec.emplace_back(record);
}

{ // try stellar_matcher
seqan3::sequence_file_input<dna4_adaptor_traits> fin{std::istringstream{arguments.query_file}, seqan3::format_fasta{}};
using record_type = typename decltype(fin)::record_type;
using sequence_type = std::remove_cvref_t<decltype(std::declval<record_type>().sequence())>;

std::vector<record_type> rec_vec{};
for (auto & record : fin)
{
rec_vec.emplace_back(record);
}

std::vector<sequence_type> seq_vec(rec_vec.size());
for (auto seq : rec_vec | std::views::transform([](record_type record) { return record.sequence(); }))
{
seq_vec.emplace_back(seq);
}
jst::contrib::stellar_matcher<sequence_type> matcher(seq_vec, (double) arguments.error_rate, (unsigned) arguments.minLength);
}

for (; !atEnd(inSeqs); ++seqCount)
{
TSequence seq{};
Expand Down
3 changes: 2 additions & 1 deletion include/valik/search/query_record.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ template <typename TSequence>
struct shared_query_record
{
std::string sequence_id;
std::vector<seqan2::alphabet_adaptor<seqan3::dna4>> sequence;
std::vector<seqan3::dna4> sequence;
//std::vector<seqan2::alphabet_adaptor<seqan3::dna4>> sequence;
seqan2::Segment<TSequence const, seqan2::InfixSegment> querySegment;
std::shared_ptr<TSequence> underlyingData;

Expand Down
9 changes: 6 additions & 3 deletions include/valik/search/search_local.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <stellar/io/import_sequence.hpp>
#include <stellar/utils/stellar_app_runtime.hpp>
#include <stellar3.shared.hpp>
#include <dream_stellar/stellar_index.hpp>

namespace valik::app
{
Expand Down Expand Up @@ -141,8 +142,10 @@ bool search_local(search_arguments & arguments, search_time_statistics & time_st
}
}

using TAlphabet = seqan2::alphabet_adaptor<seqan3::dna4>;
using TSequence = std::vector<TAlphabet>;
using TAlphabet = seqan2::Dna;
using TSequence = seqan2::String<TAlphabet>;
//using TAlphabet = seqan2::alphabet_adaptor<seqan3::dna4>;
//using TSequence = std::vector<TAlphabet>;

// the queue hands records over from the producer threads (valik prefiltering) to the consumer threads (stellar search)
auto queue = cart_queue<shared_query_record<TSequence>>{ref_meta.seg_count, arguments.cart_max_capacity, arguments.max_queued_carts};
Expand Down Expand Up @@ -241,7 +244,7 @@ bool search_local(search_arguments & arguments, search_time_statistics & time_st
stellar::_writeMoreCalculatedParams(threadOptions, threadOptions.referenceLength, queries, thread_meta.text_out);

auto swift_index_time = stellarThreadTime.swift_index_construction_time.now();
stellar::StellarIndex<TAlphabet> stellarIndex{queries, threadOptions};
dream_stellar::StellarIndex<TAlphabet> stellarIndex{queries, threadOptions};
stellar::StellarSwiftPattern<TAlphabet> swiftPattern = stellarIndex.createSwiftPattern();

// Construct index of the queries
Expand Down
7 changes: 7 additions & 0 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <vector>

#include <utilities/threshold/basics.hpp>
#include <utilities/alphabet_wrapper/seqan/alphabet.hpp>

#include <seqan3/io/sequence_file/input.hpp>
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>
Expand Down Expand Up @@ -56,6 +57,12 @@ struct dna4_traits : seqan3::sequence_file_input_default_traits_dna
using sequence_alphabet = seqan3::dna4;
};

struct dna4_adaptor_traits : seqan3::sequence_file_input_default_traits_dna
{
using sequence_alphabet = seqan2::alphabet_adaptor<seqan3::dna4>; // instead of dna5
using sequence_legal_alphabet = sequence_alphabet;
};

struct split_arguments
{
std::vector<std::vector<std::string>> bin_path{};
Expand Down

0 comments on commit 2a71184

Please sign in to comment.