diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ce21cc..0bba9e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,45 +232,18 @@ else() set_target_properties(alignment-writer PROPERTIES EXCLUDE_FROM_ALL 1) set(CMAKE_ALIGNMENT_WRITER_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/alignment-writer/include) set(CMAKE_ALIGNMENT_WRITER_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/lib/libalignment-writer.a) + set(CMAKE_BITMAGIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/alignment-writer/external/BitMagic-7.12.3/src/) endif() include_directories(${CMAKE_ALIGNMENT_WRITER_HEADERS}) target_link_libraries(mSWEEP ${CMAKE_ALIGNMENT_WRITER_LIBRARY}) -## telescope -if (DEFINED CMAKE_TELESCOPE_LIBRARY AND DEFINED CMAKE_TELESCOPE_HEADERS) - message(STATUS "telescope headers provided in: ${CMAKE_TELESCOPE_HEADERS}") - message(STATUS "telescope library provided in: ${CMAKE_TELESCOPE_LIBRARY}") +## BitMagic +if (DEFINED CMAKE_BITMAGIC_HEADERS) + message(STATUS "BitMagic headers provided in: ${CMAKE_BITMAGIC_HEADERS}") else() - FetchContent_Declare(telescope - GIT_REPOSITORY https://github.com/tmaklin/telescope.git - GIT_TAG v0.7.3 - PREFIX "external" - SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/telescope" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/telescope" - BUILD_IN_SOURCE 0 - CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS} - -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} - -D CMAKE_CXXIO_HEADERS=${CMAKE_CXXIO_HEADERS} - -D CMAKE_ALIGNMENT_WRITER_HEADERS=${CMAKE_ALIGNMENT_WRITER_HEADERS} - -D CMAKE_ALIGNMENT_WRITER_LIBRARY=${CMAKE_ALIGNMENT_WRITER_LIBRARY} - -D CMAKE_BITMAGIC_HEADERS=${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/external/BitMagic-7.12.3/src - -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - -D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - -D "CMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - INSTALL_COMMAND "" - ) - FetchContent_MakeAvailable(telescope) - add_dependencies(telescope libalignmentwriter) - add_dependencies(mSWEEP telescope) - set_target_properties(telescope PROPERTIES EXCLUDE_FROM_ALL 1) - set(CMAKE_TELESCOPE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/include) - set(CMAKE_TELESCOPE_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/lib/libtelescope.a) - set(CMAKE_BITMAGIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/external/BitMagic-7.12.3/src) + message(FATAL_ERROR "Provide BitMagic C++ headers with -DCMAKE_BITMAGIC_HEADERS") endif() -include_directories(${CMAKE_TELESCOPE_HEADERS} ${CMAKE_BITMAGIC_HEADERS}) -target_link_libraries(mSWEEP ${CMAKE_TELESCOPE_LIBRARY}) +include_directories(${CMAKE_BITMAGIC_HEADERS}) ## seamat if (DEFINED CMAKE_SEAMAT_HEADERS) @@ -283,6 +256,7 @@ else() SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/seamat" BUILD_IN_SOURCE 0 CMAKE_ARGS -D CMAKE_BUILD_TESTS=0 + -D CMAKE_BITMAGIC_HEADERS=${CMAKE_BITMAGIC_HEADERS} BUILD_COMMAND "" CONFIGURE_COMMAND "" INSTALL_COMMAND "" @@ -332,7 +306,7 @@ if (DEFINED CMAKE_MGEMS_LIBRARY AND DEFINED CMAKE_MGEMS_HEADERS) else() FetchContent_Declare(mGEMS GIT_REPOSITORY https://github.com/PROBIC/mGEMS.git - GIT_TAG v1.3.0 + GIT_TAG v1.3.3 PREFIX "external" SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/mGEMS" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS" @@ -341,9 +315,8 @@ else() -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} -D CMAKE_CXXIO_HEADERS=${CMAKE_CXXIO_HEADERS} -D CMAKE_ALIGNMENT_WRITER_HEADERS=${CMAKE_ALIGNMENT_WRITER_HEADERS} - -D CMAKE_BITMAGIC_HEADERS=${CMAKE_CURRENT_BINARY_DIR}/external/telescope/external/BitMagic-7.12.3/src + -D CMAKE_BITMAGIC_HEADERS=${CMAKE_BITMAGIC_HEADERS} -D CMAKE_SEAMAT_HEADERS=${CMAKE_SEAMAT_HEADERS} - -D CMAKE_TELESCOPE_HEADERS=${CMAKE_TELESCOPE_HEADERS} -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}" -D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" @@ -352,13 +325,13 @@ else() INSTALL_COMMAND "" ) FetchContent_MakeAvailable(mGEMS) - add_dependencies(mGEMS telescope libalignmentwriter) + add_dependencies(mGEMS libalignmentwriter) add_dependencies(mSWEEP libmgems) set_target_properties(mGEMS PROPERTIES EXCLUDE_FROM_ALL 1) - set(CMAKE_MGEMS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/mGEMS/include) + set(CMAKE_MGEMS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/mGEMS/include ${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS/include) set(CMAKE_MGEMS_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS/lib/libmgems.a) endif() -target_link_libraries(mSWEEP ${CMAKE_MGEMS_LIBRARY} ${CMAKE_TELESCOPE_LIBRARY} ${CMAKE_ALIGNMENT_WRITER_LIBRARY}) +target_link_libraries(mSWEEP ${CMAKE_MGEMS_LIBRARY} ${CMAKE_ALIGNMENT_WRITER_LIBRARY}) include_directories(${CMAKE_MGEMS_HEADERS}) include_directories( diff --git a/include/Likelihood.hpp b/include/Likelihood.hpp index 88d0926..69d2c85 100644 --- a/include/Likelihood.hpp +++ b/include/Likelihood.hpp @@ -26,7 +26,6 @@ #define MSWEEP_LIKELIHOOD_HPP #include "Matrix.hpp" -#include "telescope.hpp" #include "mSWEEP_openmp_config.hpp" @@ -41,6 +40,7 @@ #include #include +#include "mSWEEP_alignment.hpp" #include "Grouping.hpp" namespace mSWEEP { @@ -106,52 +106,91 @@ class LL_WOR21 : public Likelihood { return ll_mat; } - void fill_ll_mat(const telescope::Alignment &alignment, const std::vector &group_sizes, const size_t n_groups, const size_t min_hits) { + void fill_ll_mat(const mSWEEP::Alignment &alignment, const std::vector &group_sizes, const size_t n_groups, const size_t min_hits) { size_t num_ecs = alignment.n_ecs(); + size_t n_targets = alignment.get_n_targets(); + + size_t n_threads = 1; +#if defined(MSWEEP_OPENMP_SUPPORT) && (MSWEEP_OPENMP_SUPPORT) == 1 +#pragma omp parallel + { + n_threads = omp_get_num_threads(); + } +#endif + + // This double loop is currently the slowest part in the input reading + std::vector>> local_counts(n_threads); +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < num_ecs; ++i) { + for (size_t j = 0; j < n_targets; ++j) { + if (alignment(i, j)) { +#if defined(MSWEEP_OPENMP_SUPPORT) && (MSWEEP_OPENMP_SUPPORT) == 1 + local_counts[omp_get_thread_num()].inc((size_t)((size_t)alignment.get_groups()[j]*num_ecs) + i); +#else + local_counts[0].inc((size_t)((size_t)alignment.get_groups()[j]*num_ecs) + i); +#endif + } + } + } + + bm::sparse_vector> group_counts = std::move(local_counts[0]); + for (size_t i = 1; i < n_threads; ++i) { + group_counts.merge(local_counts[i]); + } bool mask_groups = min_hits > 0; this->groups_mask = std::vector(n_groups, !mask_groups); std::vector masked_group_sizes; + std::vector groups_pos(n_groups, 0); + size_t n_masked_groups = 0; if (mask_groups) { std::vector group_hit_counts(n_groups, (size_t)0); // Create mask identifying groups that have at least 1 alignment - for (size_t i = 0; i < num_ecs; ++i) { - for (size_t j = 0; j < n_groups; ++j) { - group_hit_counts[j] += (alignment(j, i) > 0) * alignment.reads_in_ec(i); +#pragma omp parallel for schedule(static) reduction(vec_size_t_plus:group_hit_counts) + for (size_t j = 0; j < n_groups; ++j) { + for (size_t i = 0; i < num_ecs; ++i) { + group_hit_counts[j] += (group_counts[j*num_ecs + i] > 0) * alignment.reads_in_ec(i); } } + for (size_t i = 0; i < n_groups; ++i) { this->groups_mask[i] = groups_mask[i] || (group_hit_counts[i] >= min_hits); if (this->groups_mask[i]) { + groups_pos[i] = n_masked_groups; masked_group_sizes.push_back(group_sizes[i]); + ++n_masked_groups; } } } else { masked_group_sizes = group_sizes; +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n_groups; ++i) { + groups_pos[i] = i; + } } - size_t n_masked_groups = masked_group_sizes.size(); + n_masked_groups = masked_group_sizes.size(); this->update_bb_parameters(masked_group_sizes, n_masked_groups, this->bb_constants); const seamat::DenseMatrix &precalc_lls_mat = this->precalc_lls(masked_group_sizes, n_masked_groups); this->log_likelihoods.resize(n_masked_groups, num_ecs, std::log(this->zero_inflation)); - for (size_t j = 0; j < num_ecs; ++j) { - size_t groups_pos = 0; - for (size_t i = 0; i < n_groups; ++i) { + +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < n_groups; ++i) { if (this->groups_mask[i]) { - this->log_likelihoods(groups_pos, j) = precalc_lls_mat(groups_pos, alignment(i, j)); - ++groups_pos; + for (size_t j = 0; j < num_ecs; ++j) { + this->log_likelihoods(groups_pos[i], j) = precalc_lls_mat(groups_pos[i], group_counts[i*num_ecs + j]); + } } - } } } - void fill_ec_counts(const telescope::Alignment &alignment) { + void fill_ec_counts(const mSWEEP::Alignment &alignment) { // Fill log ec counts. this->log_ec_counts.resize(alignment.n_ecs(), 0); #pragma omp parallel for schedule(static) for (size_t i = 0; i < alignment.n_ecs(); ++i) { - this->log_ec_counts[i] = std::log(alignment.reads_in_ec(i)); + this->log_ec_counts[i] = std::log(alignment.reads_in_ec(i)); } } @@ -170,14 +209,14 @@ class LL_WOR21 : public Likelihood { public: LL_WOR21() = default; - LL_WOR21(const std::vector &group_sizes, const telescope::Alignment &alignment, const size_t n_groups, const T tol, const T frac_mu, const size_t min_hits, const T _zero_inflation) { + LL_WOR21(const std::vector &group_sizes, const mSWEEP::Alignment &alignment, const size_t n_groups, const T tol, const T frac_mu, const size_t min_hits, const T _zero_inflation) { this->bb_constants[0] = tol; this->bb_constants[1] = frac_mu; this->zero_inflation = _zero_inflation; this->from_grouped_alignment(alignment, group_sizes, n_groups, min_hits); } - void from_grouped_alignment(const telescope::Alignment &alignment, const std::vector &group_sizes, const size_t n_groups, const size_t min_hits) { + void from_grouped_alignment(const mSWEEP::Alignment &alignment, const std::vector &group_sizes, const size_t n_groups, const size_t min_hits) { this->fill_ll_mat(alignment, group_sizes, n_groups, min_hits); this->fill_ec_counts(alignment); } @@ -292,7 +331,7 @@ class LL_WOR21 : public Likelihood { const std::vector& groups_considered() const override { return this->groups_mask; }; }; template -std::unique_ptr> ConstructAdaptiveLikelihood(const telescope::Alignment &alignment, const Grouping &grouping, const T q, const T e, const size_t min_hits, const T zero_inflation) { +std::unique_ptr> ConstructAdaptiveLikelihood(const mSWEEP::Alignment &alignment, const Grouping &grouping, const T q, const T e, const size_t min_hits, const T zero_inflation) { size_t max_group_size = grouping.max_group_size(); size_t n_groups = grouping.get_n_groups(); std::unique_ptr> log_likelihoods; diff --git a/include/Sample.hpp b/include/Sample.hpp index 8a3ce79..b9eb3d0 100644 --- a/include/Sample.hpp +++ b/include/Sample.hpp @@ -33,7 +33,7 @@ #include #include "Matrix.hpp" -#include "telescope.hpp" +#include "mSWEEP_alignment.hpp" namespace mSWEEP { class Sample { @@ -47,7 +47,7 @@ class Sample { std::vector log_KLDs; protected: - void count_alignments(const telescope::Alignment &alignment); + void count_alignments(const mSWEEP::Alignment &alignment); public: // Virtual functions @@ -111,7 +111,7 @@ class PlainSample : public Sample { public: PlainSample() = default; - PlainSample(const telescope::Alignment &alignment) { + PlainSample(const mSWEEP::Alignment &alignment) { this->count_alignments(alignment); } @@ -132,7 +132,7 @@ class BinningSample : public PlainSample, public Binning { public: BinningSample() = default; - BinningSample(const telescope::Alignment &alignment) { + BinningSample(const mSWEEP::Alignment &alignment) { this->count_alignments(alignment); this->store_aligned_reads(alignment.get_aligned_reads()); } @@ -157,19 +157,19 @@ class BootstrapSample : public Sample { std::vector> bootstrap_results; // Set all variables required to bootstrap the ec_counts later - void init_bootstrap(const telescope::Alignment &alignment); + void init_bootstrap(const mSWEEP::Alignment &alignment); protected: - void construct(const telescope::Alignment &alignment, const size_t _iters, const int32_t seed, const size_t bootstrap_count=0); + void construct(const mSWEEP::Alignment &alignment, const size_t _iters, const int32_t seed, const size_t bootstrap_count=0); public: BootstrapSample() = default; // Set seed in constructor - BootstrapSample(const telescope::Alignment &alignment, const size_t _iters, const int32_t seed) { + BootstrapSample(const mSWEEP::Alignment &alignment, const size_t _iters, const int32_t seed) { this->construct(alignment, _iters, seed); } - BootstrapSample(const telescope::Alignment &alignment, const size_t _iters, const size_t _bootstrap_count, const int32_t seed) { + BootstrapSample(const mSWEEP::Alignment &alignment, const size_t _iters, const size_t _bootstrap_count, const int32_t seed) { this->construct(alignment, _iters, seed, _bootstrap_count); } @@ -191,18 +191,18 @@ class BootstrapSample : public Sample { class BinningBootstrap : public BootstrapSample, public Binning { public: - BinningBootstrap(const telescope::Alignment &alignment, const size_t _iters, const int32_t seed) { + BinningBootstrap(const mSWEEP::Alignment &alignment, const size_t _iters, const int32_t seed) { this->construct(alignment, _iters, seed); this->store_aligned_reads(alignment.get_aligned_reads()); } - BinningBootstrap(const telescope::Alignment &alignment, const size_t _iters, const size_t _bootstrap_count, const int32_t seed) { + BinningBootstrap(const mSWEEP::Alignment &alignment, const size_t _iters, const size_t _bootstrap_count, const int32_t seed) { this->construct(alignment, _iters, seed, _bootstrap_count); this->store_aligned_reads(alignment.get_aligned_reads()); } }; -void ConstructSample(const telescope::Alignment &alignment, const size_t bootstrap_iters, const size_t bootstrap_count, const size_t bootstrap_seed, const bool bin_reads, std::unique_ptr &sample); +void ConstructSample(const mSWEEP::Alignment &alignment, const size_t bootstrap_iters, const size_t bootstrap_count, const size_t bootstrap_seed, const bool bin_reads, std::unique_ptr &sample); } diff --git a/include/mSWEEP_alignment.hpp b/include/mSWEEP_alignment.hpp new file mode 100644 index 0000000..36aec10 --- /dev/null +++ b/include/mSWEEP_alignment.hpp @@ -0,0 +1,246 @@ +// mSWEEP: Estimate abundances of reference lineages in DNA sequencing reads. +// +// MIT License +// +// Copyright (c) 2023 Probabilistic Inference and Computational Biology group @ UH +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +#ifndef MSWEEP_ALIGNMENT_HPP +#define MSWEEP_ALIGNMENT_HPP + +#include "bm64.h" +#include "unpack.hpp" + +#include "mSWEEP_openmp_config.hpp" + +#include +#include +#include +#include + +namespace mSWEEP { +class Alignment { +private: + size_t n_targets; + size_t n_queries; + std::vector group_indicators; + size_t n_groups; + std::vector> ec_read_ids; + std::vector ec_counts; + bm::bvector<> bits; + +public: + Alignment(size_t _n_targets) { + this->n_targets = _n_targets; + } + + void ReadPlaintextLine(const size_t n_targets, std::string &line, bm::bvector<>::bulk_insert_iterator &it) { + std::string part; + std::stringstream partition(line); + + // First column is read id (0-based indexing). + std::getline(partition, part, ' '); + size_t read_id = std::stoul(part); + + // Next columns contain the target sequence id (0-based indexing). + while (std::getline(partition, part, ' ')) { + *it = read_id*n_targets + std::stoul(part); // set bit `n_reads*n_refs + std::stoul(part)` as true + } + } + + size_t ReadPlaintextAlignment(const size_t n_targets, std::string &line, std::istream *stream, bm::bvector<> *ec_configs) { + bm::bvector<>::bulk_insert_iterator it(*ec_configs); // Bulk insert iterator buffers the insertions + + size_t n_reads = 1; + try { + // Contents of the first line is already stored in `line` + ReadPlaintextLine(n_targets, line, it); + + size_t compress_interval = 1000000; + while (std::getline(*stream, line)) { + // Insert each line into the alignment + ReadPlaintextLine(n_targets, line, it); + ++n_reads; + if (n_reads % compress_interval == 0) { + ec_configs->optimize(); + } + } + } catch (const std::exception &e) { + std::string msg(e.what()); + if (msg.find("stoul") != std::string::npos) { + throw std::runtime_error("File format not supported on line " + std::to_string(n_reads) + " with content: " + line); + } else { + throw std::runtime_error("Could not parse line " + std::to_string(n_reads) + " with content: " + line); + } + } + return n_reads; + } + + + void read(const std::string &merge_mode, std::vector &strands) { + for (size_t i = 0; i < strands.size(); ++i) { + std::string line; + std::getline(*strands[i], line); // Read the first line to check the format + size_t n_reads; + bm::bvector<> strand_alignment; + if (line.find(',') != std::string::npos) { + // First line contains a ','; stream could be in the compact format. + size_t n_refs; + alignment_writer::ReadHeader(line, &n_reads, &n_refs); + if (n_refs > this->n_targets) { + throw std::runtime_error("Pseudoalignment file has more target sequences than expected."); + } else if (this->n_targets < n_refs) { + throw std::runtime_error("Pseudoalignment file has less target sequences than expected."); + } + // Size is given on the header line. + strand_alignment.resize(n_reads*n_refs); + alignment_writer::UnpackData(strands[i], strand_alignment); + } else { + // Stream could be in the plaintext format. + // Size is unknown. + strand_alignment.set_new_blocks_strat(bm::BM_GAP); + n_reads = ReadPlaintextAlignment(n_targets, line, strands[i], &strand_alignment); + } + this->n_queries = n_reads; + + if (i == 0) { + this->bits = std::move(strand_alignment); + } else { + if (merge_mode == "intersection") { + this->bits.bit_and(strand_alignment); + } else if (merge_mode == "union") { + this->bits.bit_or(strand_alignment); + } else { + throw std::runtime_error("Unrecognized option `" + merge_mode + "` for --themisto-mode"); + } + } + } + } + + void collapse() { + size_t n_threads = 1; +#if defined(MSWEEP_OPENMP_SUPPORT) && (MSWEEP_OPENMP_SUPPORT) == 1 +#pragma omp parallel + { + n_threads = omp_get_num_threads(); + } +#endif + + std::vector>> mymap(n_threads); +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < this->n_queries; ++i) { + if (bits.any_range(i*this->n_targets, (i + 1)*this->n_targets - 1)) { + size_t hash = 0; + for (size_t j = 0; j < this->n_targets; ++j) { + if (this->bits[i*this->n_targets + j]) { + hash ^= j + 0x517cc1b727220a95 + (hash << 6) + (hash >> 2); + } + } +#if defined(MSWEEP_OPENMP_SUPPORT) && (MSWEEP_OPENMP_SUPPORT) == 1 + auto got = mymap[omp_get_thread_num()].find(hash); + if (got == mymap[omp_get_thread_num()].end()) { + mymap[omp_get_thread_num()].insert(std::make_pair(hash, std::vector({(uint32_t)i}))); +#else + auto got = mymap[0].find(hash); + if (got == mymap[0].end()) { + mymap[0].insert(std::make_pair(hash, std::vector({(uint32_t)i}))); + +#endif + } else { + got->second.emplace_back(i); + } + } + } + + std::map> map; + for (size_t i = 0; i < n_threads; ++i) { + for (auto kv : mymap[i]) { + auto got = map.find(kv.first); + if (got == map.end()) { + map.insert(std::make_pair(kv.first, std::move(kv.second))); + } else { + got->second.insert(got->second.end(), + std::make_move_iterator(kv.second.begin()), + std::make_move_iterator(kv.second.end())); + } + } + } + + size_t n_ecs = map.size(); + this->ec_counts = std::vector(n_ecs, 0); + this->ec_read_ids = std::vector>(n_ecs); + size_t i = 0; + std::vector> collapsed_bits(n_threads); + +#pragma omp parallel + { + size_t i = 0; + size_t thread_id = 0; +#if defined(MSWEEP_OPENMP_SUPPORT) && (MSWEEP_OPENMP_SUPPORT) == 1 + thread_id = omp_get_thread_num(); +#endif + collapsed_bits[thread_id].resize(n_ecs*this->n_targets); + for(auto element = map.begin(); element !=map.end(); ++element, i++) { + if(i%n_threads == thread_id) { + this->ec_read_ids[i] = std::move(element->second); + this->ec_counts[i] = this->ec_read_ids[i].size(); + for (size_t k = 0; k < this->n_targets; ++k) { + collapsed_bits[thread_id][i*this->n_targets + k] = this->bits[this->ec_read_ids[i][0]*this->n_targets + k]; + } + } + } + } + + this->bits.clear(); + for (size_t i = 0; i < n_threads; ++i) { + this->bits.bit_or(collapsed_bits[i]); + } + } + + size_t n_ecs() const { return this->ec_counts.size(); }; + size_t n_reads() const { return this->n_queries; }; + size_t reads_in_ec(const size_t i) const { return this->ec_counts[i]; }; + size_t get_n_targets() const { return this->n_targets; }; + + bool operator()(const size_t row, const size_t col) const { return this->bits[row*this->n_targets + col]; } + + const std::vector>& get_aligned_reads() const { + return this->ec_read_ids; + } + + const std::vector& reads_assigned_to_ec(size_t ec_id) const { return this->ec_read_ids[ec_id]; } + + template + void add_groups(const std::vector &grouping) { + size_t _n_groups = 0; + this->group_indicators = std::vector(grouping.size()); + for (size_t i = 0; i < grouping.size(); ++i) { + group_indicators[i] = grouping[i]; + _n_groups = (n_groups > grouping[i] ? n_groups : grouping[i]); + } + this->n_groups = _n_groups + 1; + } + + const std::vector& get_groups() const { return this->group_indicators; }; + +}; +} + +#endif diff --git a/include/mSWEEP_openmp_config.hpp.in b/include/mSWEEP_openmp_config.hpp.in index 0c78efe..3b2f295 100644 --- a/include/mSWEEP_openmp_config.hpp.in +++ b/include/mSWEEP_openmp_config.hpp.in @@ -33,6 +33,9 @@ #pragma omp declare reduction(vec_double_plus : std::vector : \ std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus())) \ initializer(omp_priv = decltype(omp_orig)(omp_orig.size())) +#pragma omp declare reduction(vec_size_t_plus : std::vector : \ + std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus())) \ + initializer(omp_priv = decltype(omp_orig)(omp_orig.size())) #endif diff --git a/src/BootstrapSample.cpp b/src/BootstrapSample.cpp index 694baf9..cd52f56 100644 --- a/src/BootstrapSample.cpp +++ b/src/BootstrapSample.cpp @@ -30,7 +30,7 @@ #include "mSWEEP_version.h" namespace mSWEEP { -void BootstrapSample::init_bootstrap(const telescope::Alignment &alignment) { +void BootstrapSample::init_bootstrap(const mSWEEP::Alignment &alignment) { // Clear the bootstrap abundances in case we're estimating the same sample again. this->bootstrap_results = std::vector>(); @@ -43,7 +43,7 @@ void BootstrapSample::init_bootstrap(const telescope::Alignment &alignment) { ec_distribution = std::discrete_distribution(weights.begin(), weights.end()); } -void BootstrapSample::construct(const telescope::Alignment &alignment, const size_t _iters, const int32_t seed, const size_t _bootstrap_count) { +void BootstrapSample::construct(const mSWEEP::Alignment &alignment, const size_t _iters, const int32_t seed, const size_t _bootstrap_count) { this->count_alignments(alignment); if (seed == 26012023) { std::random_device rd; diff --git a/src/Sample.cpp b/src/Sample.cpp index 0a524b1..6fa16ee 100644 --- a/src/Sample.cpp +++ b/src/Sample.cpp @@ -27,7 +27,7 @@ #include namespace mSWEEP { -void ConstructSample(const telescope::Alignment &alignment, const size_t bootstrap_iters, const size_t bootstrap_count, const size_t bootstrap_seed, const bool bin_reads, std::unique_ptr &sample) { +void ConstructSample(const mSWEEP::Alignment &alignment, const size_t bootstrap_iters, const size_t bootstrap_count, const size_t bootstrap_seed, const bool bin_reads, std::unique_ptr &sample) { // Wrapper for determining which Sample type to construct. // Initialize Sample depending on how the alignment needs to be processed. if (bootstrap_iters > 0) { @@ -49,7 +49,7 @@ void ConstructSample(const telescope::Alignment &alignment, const size_t bootstr } } -void Sample::count_alignments(const telescope::Alignment &alignment) { +void Sample::count_alignments(const mSWEEP::Alignment &alignment) { // Count the number of aligned reads and store in counts_total size_t aln_counts_total = 0; #pragma omp parallel for schedule(static) reduction(+:aln_counts_total) diff --git a/src/mSWEEP.cpp b/src/mSWEEP.cpp index 04d666c..5cf5cea 100644 --- a/src/mSWEEP.cpp +++ b/src/mSWEEP.cpp @@ -34,11 +34,12 @@ #include "cxxargs.hpp" #include "Matrix.hpp" -#include "telescope.hpp" #include "cxxio.hpp" #include "rcgpar.hpp" #include "bin_reads.h" +#include "mSWEEP_alignment.hpp" + #include "mSWEEP_mpi_config.hpp" #include "mSWEEP_openmp_config.hpp" #include "mSWEEP_version.h" @@ -332,7 +333,8 @@ int main (int argc, char *argv[]) { // To save memory, the alignment can go out of scope. // The necessary values are stored in the Sample class. log << " reading pseudoalignments" << '\n'; - std::unique_ptr alignment; + mSWEEP::Alignment alignment(n_refs); + // std::unique_ptr alignment; try { const std::vector &alignment_paths = args.value>("themisto"); size_t n_files = alignment_paths.size(); @@ -350,16 +352,12 @@ int main (int argc, char *argv[]) { strands.emplace_back(&std::cin); } - // Read the pseudoalignment - if (n_groups <= std::numeric_limits::max()) { - telescope::read::ThemistoGrouped(telescope::get_mode(args.value("themisto-mode")), n_refs, static_cast*>(&(*reference))->get_group_indicators(i), strands, alignment); - } else if (n_groups <= std::numeric_limits::max()) { - telescope::read::ThemistoGrouped(telescope::get_mode(args.value("themisto-mode")), n_refs, static_cast*>(&(*reference))->get_group_indicators(i), strands, alignment); - } else if (n_groups <= std::numeric_limits::max()) { - telescope::read::ThemistoGrouped(telescope::get_mode(args.value("themisto-mode")), n_refs, static_cast*>(&(*reference))->get_group_indicators(i), strands, alignment); - } else { - telescope::read::ThemistoGrouped(telescope::get_mode(args.value("themisto-mode")), n_refs, static_cast*>(&(*reference))->get_group_indicators(i), strands, alignment); - } + alignment.read(args.value("themisto-mode"), strands); + log << " read alignments for " << alignment.n_reads() << " reads" << '\n'; + log << "Building equivalence classes" << '\n'; + alignment.collapse(); + log << " found " << alignment.n_ecs() << " unique alignments" << '\n'; + } catch (std::exception &e) { finalize("Reading the pseudoalignments failed:\n " + std::string(e.what()) + "\nexiting\n", log, true); return 1; @@ -367,7 +365,19 @@ int main (int argc, char *argv[]) { // Use the alignment data to populate the log_likelihoods matrix. try { - log_likelihoods = mSWEEP::ConstructAdaptiveLikelihood(*alignment, reference->get_grouping(i), args.value('q'), args.value('e'), args.value("min-hits"), args.value("zero-inflation")); + // Read the pseudoalignment + log << "Computing the likelihood matrix" << '\n'; + if (n_groups <= std::numeric_limits::max()) { + alignment.add_groups(static_cast*>(&(*reference))->get_group_indicators(i)); + } else if (n_groups <= std::numeric_limits::max()) { + alignment.add_groups(static_cast*>(&(*reference))->get_group_indicators(i)); + } else if (n_groups <= std::numeric_limits::max()) { + alignment.add_groups(static_cast*>(&(*reference))->get_group_indicators(i)); + } else { + alignment.add_groups(static_cast*>(&(*reference))->get_group_indicators(i)); + } + + log_likelihoods = mSWEEP::ConstructAdaptiveLikelihood(alignment, reference->get_grouping(i), args.value('q'), args.value('e'), args.value("min-hits"), args.value("zero-inflation")); } catch (std::exception &e) { finalize("Building the log-likelihood array failed:\n " + std::string(e.what()) + "\nexiting\n", log, true); return 1; @@ -375,9 +385,8 @@ int main (int argc, char *argv[]) { // Initialize Sample depending on how the alignment needs to be processed. // Note: this is also only used by the root process in MPI configuration. - mSWEEP::ConstructSample(*alignment, args.value("iters"), args.value("bootstrap-count"), args.value("seed"), bin_reads, sample); + mSWEEP::ConstructSample(alignment, args.value("iters"), args.value("bootstrap-count"), args.value("seed"), bin_reads, sample); - log << " read " << alignment->n_ecs() << " unique alignments" << '\n'; log.flush(); } else { try {