Skip to content

Commit

Permalink
Merge pull request #17 from PROBIC/mpi-support
Browse files Browse the repository at this point in the history
mSWEEP-v1.6.1 (5 May 2022)
  • Loading branch information
tmaklin authored May 6, 2022
2 parents 91b2467 + d173522 commit 52446f8
Show file tree
Hide file tree
Showing 26 changed files with 691 additions and 1,138 deletions.
46 changes: 42 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,20 @@ endif()
# Compile all targets as C++11 and do not let CMake silently fall back to an
# older standard when the compiler lacks C++11 support.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

### MPI
# MPI is optional. When an installation is found, MSWEEP_MPI_SUPPORT is set to 1
# and the MPI header directories are added to the include path; otherwise
# MSWEEP_MPI_SUPPORT is set to 0.
find_package(MPI)
if(MPI_FOUND)
  set(MSWEEP_MPI_SUPPORT 1)
  # The FindMPI result variables must be expanded: a bare variable name would
  # be treated as a literal directory called "MPI_C_INCLUDE_DIRS". Variables
  # that are empty simply contribute no directories.
  include_directories(${MPI_C_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS})
  # The process limit can be overridden with -DCMAKE_MPI_MAX_PROCESSES=<n>;
  # it defaults to 1024 when the variable is unset or false.
  if(CMAKE_MPI_MAX_PROCESSES)
    set(MSWEEP_MPI_MAX_PROCESSES ${CMAKE_MPI_MAX_PROCESSES})
  else()
    set(MSWEEP_MPI_MAX_PROCESSES 1024)
  endif()
else()
  set(MSWEEP_MPI_SUPPORT 0)
endif()

# Collect build products in fixed locations inside the build tree:
# libraries under <build>/lib and executables under <build>/bin.
# NOTE(review): LIBRARY_OUTPUT_PATH is a legacy variable; CMake's documentation
# recommends the CMAKE_*_OUTPUT_DIRECTORY variables instead - confirm before migrating.
set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

Expand All @@ -59,6 +73,8 @@ string(TIMESTAMP _BUILD_TIMESTAMP)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/version.h.in ${CMAKE_BINARY_DIR}/include/version.h @ONLY)
## Configure OpenMP if it is supported on the system.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/openmp_config.hpp.in ${CMAKE_BINARY_DIR}/include/openmp_config.hpp @ONLY)
## Configure MPI if it's supported on the system.
## The generated mpi_config.hpp receives the value of MSWEEP_MPI_SUPPORT
## (1 when find_package(MPI) succeeded, 0 otherwise). @ONLY restricts the
## substitution to @VAR@ references in the template.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/mpi_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/mpi_config.hpp @ONLY)

add_executable(mSWEEP
${CMAKE_SOURCE_DIR}/src/BootstrapSample.cpp
Expand All @@ -67,11 +83,8 @@ ${CMAKE_SOURCE_DIR}/src/Grouping.cpp
${CMAKE_SOURCE_DIR}/src/Sample.cpp
${CMAKE_SOURCE_DIR}/src/likelihood.cpp
${CMAKE_SOURCE_DIR}/src/mSWEEP.cpp
${CMAKE_SOURCE_DIR}/src/matrix.cpp
${CMAKE_SOURCE_DIR}/src/parse_arguments.cpp
${CMAKE_SOURCE_DIR}/src/process_reads.cpp
${CMAKE_SOURCE_DIR}/src/rcg.cpp
${CMAKE_SOURCE_DIR}/src/read_pseudoalignment.cpp)
${CMAKE_SOURCE_DIR}/src/main.cpp)

## Check supported compression types
find_package(BZip2)
Expand Down Expand Up @@ -199,6 +212,31 @@ else()
endif()
include_directories(${CMAKE_TELESCOPE_HEADERS})

## rcgpar
# Use a prebuilt rcgpar when both CMAKE_RCGPAR_LIBRARY and CMAKE_RCGPAR_HEADERS
# are defined. Otherwise the sources are downloaded at configure time and built
# as a subdirectory of this project.
if(DEFINED CMAKE_RCGPAR_LIBRARY AND DEFINED CMAKE_RCGPAR_HEADERS)
  find_library(RCGPAR NAMES rcgpar HINTS ${CMAKE_RCGPAR_LIBRARY})
  # Fail early with a clear message instead of the generic NOTFOUND error
  # that would otherwise only appear when the link line is generated.
  if(NOT RCGPAR)
    message(FATAL_ERROR "Could not find the rcgpar library using CMAKE_RCGPAR_LIBRARY: ${CMAKE_RCGPAR_LIBRARY}")
  endif()
  target_link_libraries(mSWEEP ${RCGPAR})
else()
  # The template is deliberately configured without @ONLY so that the
  # ${...} references inside it are expanded with this project's values; the
  # download project therefore checks the sources out into this build tree.
  configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-rcgpar.txt.in
    ${CMAKE_BINARY_DIR}/external/rcgpar-download/CMakeLists.txt)

  # Configure the helper download project with the same generator as this build.
  execute_process(
    COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
    RESULT_VARIABLE result
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/rcgpar-download)
  if(result)
    message(FATAL_ERROR "CMake step for rcgpar failed: ${result}")
  endif()

  # "Building" the helper project fetches the rcgpar sources.
  execute_process(
    COMMAND ${CMAKE_COMMAND} --build .
    RESULT_VARIABLE result
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/rcgpar-download)
  if(result)
    message(FATAL_ERROR "Build step for rcgpar failed: ${result}")
  endif()

  # Build the downloaded sources as part of this project and link against them.
  add_subdirectory(
    ${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar
    ${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar/build)
  target_link_libraries(mSWEEP rcgomp)
  set(CMAKE_RCGPAR_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar/include)
endif()
include_directories(${CMAKE_RCGPAR_HEADERS})

include_directories(${CMAKE_SOURCE_DIR}/include
${CMAKE_SOURCE_DIR}/external ${CMAKE_SOURCE_DIR}/include/tools
${CMAKE_SOURCE_DIR}/external/cxxio
Expand Down
46 changes: 42 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ from the mSWEEP paper [in YouTube](https://www.youtube.com/watch?v=VDfChoJwSKg).

# Installation
mSWEEP can be obtained either in the form of a precompiled binary
* [Linux 64-bit binary](https://github.com/PROBIC/mSWEEP/releases/download/v1.5.2/mSWEEP_linux-v1.5.2.tar.gz)
* [macOS 64-bit binary](https://github.com/PROBIC/mSWEEP/releases/download/v1.5.2/mSWEEP_macOS-v1.5.2.tar.gz)
* [Linux 64-bit binary](https://github.com/PROBIC/mSWEEP/releases/download/v1.6.1/mSWEEP_linux-v1.6.1.tar.gz)
* [macOS 64-bit binary](https://github.com/PROBIC/mSWEEP/releases/download/v1.6.1/mSWEEP_macOS-v1.6.1.tar.gz)
or by following the instructions below for compiling mSWEEP from source.

In addition to mSWEEP, you will need to install either [Themisto
Expand Down Expand Up @@ -61,7 +61,41 @@ enter the directory and run
```
- This will compile the mSWEEP executable in build/bin/mSWEEP.

### Compilation tips for improving performance
#### MPI support
mSWEEP can be compiled with MPI support, distributing the mixture
component estimation part of the program to several processes. To
compile with MPI support, set your environment appropriately and build
mSWEEP with the following commands (example case for Open MPI):
```
> mkdir build
> cd build
> module load mpi/openmpi
> cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx ..
> make
```

The project should configure itself appropriately. To distribute the
computation to 4 processes after compiling, call mSWEEP with:
```
mpirun -np 4 mSWEEP --themisto-1 forward_aln.gz --themisto-2 reverse_aln.gz -i cluster_indicators.txt
```

In some cases it might be useful to use hybrid parallelization with
multiple threads per process. This can be accomplished through use of
the `-t` flag:
```
mpirun -np 2 mSWEEP --themisto-1 forward_aln.gz --themisto-2 reverse_aln.gz -i cluster_indicators.txt -t 2
```

which will distribute computation to two processes with two
threads each. The optimal configuration will depend on the size and
structure of your data.

Note that when mSWEEP is called through MPI, the root process will
handle all read and write operations and only the estimation part is
distributed.

#### Compilation tips for improving performance
1. If you intend to run mSWEEP on the machine used in compiling the
source code, you might want to add the '-march=native -mtune=native'
flags if compiling with GCC by running
Expand Down Expand Up @@ -427,13 +461,17 @@ mSWEEP accepts the following flags:
--write-probs
If specified, write the read equivalence class probabilities in a .csv matrix
--print-probs
Print the equivalence class probabilities rather than writing when using --write-probs
Print the read equivalence class probabilities to cout
--write-likelihood
Write the likelihood matrix to a file with "_likelihoods.txt" suffix if -o option is specified, print to cout if -o is not.
--write-likelihood-bitseq
Write the likelihoods in a format that can be parsed by BitSeq's (https://github.com/bitseq/bitseq) functions.
--gzip-probs
Gzip the .csv matrix output from --write-probs and the likelihoods from --write-likelihood or --write-likelihood-bitseq.
--read-likelihood
Read in a likelihood matrix that has been written to file with the --write-likelihood toggle.
--help
Print this message.
--version
Expand Down
3 changes: 2 additions & 1 deletion config/CMakeLists-bxzstr.txt.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ include(ExternalProject)

ExternalProject_Add(bxzstr-download
GIT_REPOSITORY https://github.com/tmaklin/bxzstr.git
GIT_TAG master
GIT_TAG v1.1.0
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr"
BUILD_IN_SOURCE 0
BUILD_COMMAND ""
CMAKE_ARGS -D ZSTD_FOUND=0
INSTALL_COMMAND ""
TEST_COMMAND ""
UPDATE_COMMAND ""
Expand Down
15 changes: 15 additions & 0 deletions config/CMakeLists-rcgpar.txt.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Helper project whose only job is to download the rcgpar sources; nothing is
# built, installed or tested here (see the empty commands below).
# This file is a configure_file() template: the variable reference in
# SOURCE_DIR is filled in by the main project before this project is run.
#
# The upper bound of the version range keeps this file usable with CMake
# releases that reject policy versions older than 3.5. CMake versions older
# than 3.12 ignore the "...<max>" part, so the minimum is still 2.8.2.
cmake_minimum_required(VERSION 2.8.2...3.10)

project(rcgpar-get NONE)
include(ExternalProject)

ExternalProject_Add(rcgpar-download
  GIT_REPOSITORY    https://github.com/tmaklin/rcgpar
  GIT_TAG           v1.0.0
  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar"
  BUILD_IN_SOURCE   0
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
  UPDATE_COMMAND    ""
)
12 changes: 12 additions & 0 deletions config/mpi_config.hpp.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Build-time MPI configuration. This header is generated from a template by
// CMake's configure_file(), which fills in the value of MSWEEP_MPI_SUPPORT.
#ifndef MSWEEP_MPI_CONFIG_HPP
#define MSWEEP_MPI_CONFIG_HPP

// Substituted by the build system: 1 when MPI was found at configure time,
// 0 otherwise.
#define MSWEEP_MPI_SUPPORT @MSWEEP_MPI_SUPPORT@

// Include the MPI header only in builds that have MPI support.
// OMPI_SKIP_MPICXX is defined before the include; see the linked Open MPI
// issue (presumably it keeps Open MPI's mpi.h from pulling in the C++ bindings).
#if defined(MSWEEP_MPI_SUPPORT) && (MSWEEP_MPI_SUPPORT) == 1
#define OMPI_SKIP_MPICXX 1 // See https://github.com/open-mpi/ompi/issues/5157
#include <mpi.h>
#endif


#endif // MSWEEP_MPI_CONFIG_HPP
3 changes: 2 additions & 1 deletion include/Grouping.hpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#ifndef MSWEEP_GROUPING_HPP
#define MSWEEP_GROUPING_HPP

#include <cstddef>
#include <vector>
#include <array>
#include <string>
#include <array>
#include <unordered_map>

class Grouping {
Expand Down
3 changes: 2 additions & 1 deletion include/Reference.hpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#ifndef MSWEEP_REFERENCE_HPP
#define MSWEEP_REFERENCE_HPP

#include <cstddef>
#include <vector>
#include <fstream>
#include <string>
#include <fstream>

#include "cxxio.hpp"

Expand Down
90 changes: 45 additions & 45 deletions include/Sample.hpp
Original file line number Diff line number Diff line change
@@ -1,86 +1,86 @@
#ifndef MSWEEP_SAMPLE_HPP
#define MSWEEP_SAMPLE_HPP

#include <cstddef>
#include <string>
#include <vector>
#include <fstream>
#include <random>

#include "telescope.hpp"
#include "Matrix.hpp"

#include "matrix.hpp"
#include "Reference.hpp"
#include "Grouping.hpp"
#include "parse_arguments.hpp"

class VSample {
public:
virtual void read_themisto(const Mode &mode, const uint32_t n_refs, std::vector<std::istream*> &strands) =0;
virtual void read_kallisto(const uint32_t n_refs, std::istream &tsv_file, std::istream &ec_file) =0;
};

class Sample : public VSample{
class Sample {
private:
uint32_t m_num_refs;
uint32_t m_num_ecs;
std::string cell_id;

// Count the number of pseudoalignments in groups defined by the given indicators.
std::vector<uint16_t> group_counts(const std::vector<uint32_t> indicators, const uint32_t ec_id, const uint32_t n_groups) const;

// Free the memory taken by ec_configs
void clear_configs() { pseudos.ec_configs.clear(); }
public:
uint32_t counts_total;

protected:
// Calculate log_ec_counts and counts_total.
void process_aln();
rcgpar::Matrix<double> ll_mat;
rcgpar::Matrix<double> ec_probs;
std::vector<double> log_ec_counts;
std::vector<double> relative_abundances;

// Alignments class from telescope
KallistoAlignment pseudos;
uint32_t counts_total;

public:
Matrix<double> ec_probs;
Matrix<double> ll_mat;
std::vector<std::vector<uint16_t>> counts;
std::vector<double> log_ec_counts;
// Calculate log_ec_counts and counts_total.
void process_aln(const bool bootstrap_mode);

// Count the number of pseudoalignments in groups defined by the given indicators.
std::vector<uint16_t> group_counts(const std::vector<uint32_t> indicators, const uint32_t ec_id, const uint32_t n_groups) const;

// Retrieve relative abundances from the ec_probs matrix.
std::vector<double> group_abundances() const;
// Write estimated relative abundances
void write_abundances(const std::vector<std::string> &cluster_indicators_to_string, std::string outfile) const;
void write_abundances(const std::vector<std::string> &cluster_indicators_to_string, std::ostream &of) const;
// Write estimated read-reference posterior probabilities (gamma_Z)
void write_probabilities(const std::vector<std::string> &cluster_indicators_to_string, std::ostream &outfile) const;
// Write likelihoods
void write_likelihood(const bool gzip_output, const uint32_t n_groups, std::string outfile) const;
void write_likelihood_bitseq(const bool gzip_output, const uint32_t n_groups, std::string outfile) const;
void write_likelihood(const uint32_t n_groups, std::ostream &of) const;
// Write likelihoods in BitSeq-compatible format
void write_likelihood_bitseq(const uint32_t n_groups, std::ostream &of) const;

// Read in the likelihoods from a file
void read_likelihood(const Grouping &grouping, std::istream &infile);

// Getters
std::string cell_name() const { return cell_id; };
uint32_t num_ecs() const { return m_num_ecs; };
uint32_t total_counts() const { return counts_total; };

// Read Themisto or kallisto pseudoalignments
void read_themisto(const Mode &mode, const uint32_t n_refs, std::vector<std::istream*> &strands) override;
void read_kallisto(const uint32_t n_refs, std::istream &tsv_file, std::istream &ec_file) override;
// Fill the likelihood matrix
void CalcLikelihood(const Grouping &grouping, const double bb_constants[2], const std::vector<uint32_t> &group_indicators, const bool cleanup);
uint32_t get_counts_total() const { return this->counts_total; };
};

class BootstrapSample : public Sample {
private:
std::mt19937_64 gen;
std::discrete_distribution<uint32_t> ec_distribution;
std::vector<std::vector<double>> relative_abundances;

// Run estimation and add results to relative_abundances
void BootstrapIter(const std::vector<double> &alpha0, const double tolerance, const uint16_t max_iters);
// Resample the equivalence class counts
void ResampleCounts(const uint32_t how_many, std::mt19937_64 &rng);
void bootstrap_iter(const std::vector<double> &resampled_log_ec_counts,
const std::vector<double> &alpha0, const double tolerance,
const uint16_t max_iters);

public:
void WriteBootstrap(const std::vector<std::string> &cluster_indicators_to_string, std::string &outfile, const unsigned iters, const bool batch_mode) const;
void BootstrapAbundances(const Grouping &grouping, const Arguments &args);
// Set seed in constructor
BootstrapSample(const int32_t seed);

std::vector<std::vector<double>> bootstrap_results;

// Resample the equivalence class counts
std::vector<double> resample_counts(const uint32_t how_many);

void init_bootstrap();

// Estimate the mixture components with bootstrap iterations
void estimate_abundances(const Arguments &args);

void write_bootstrap(const std::vector<std::string> &cluster_indicators_to_string,
const uint16_t iters, std::ostream &of) const;
void bootstrap_ec_counts(const Arguments &args);

// Read in pseudoalignments but do not free the memory used by storing the equivalence class counts.
void read_themisto(const Mode &mode, const uint32_t n_refs, std::vector<std::istream*> &strands) override;
void read_kallisto(const uint32_t n_refs, std::istream &tsv_file, std::istream &ec_file) override;
};

#endif
6 changes: 3 additions & 3 deletions include/likelihood.hpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#ifndef MSWEEP_LIKELIHOOD_HPP
#define MSWEEP_LIKELIHOOD_HPP

#include "matrix.hpp"
#include "Reference.hpp"
#include "Grouping.hpp"
#include "Sample.hpp"

void precalc_lls(const Grouping &grouping, const double bb_constants[2], Matrix<double> *ll_mat);
void likelihood_array_mat(const Grouping &grouping, const std::vector<uint32_t> &group_indicators, const double bb_constants[2], Sample &sample);

#endif
Loading

0 comments on commit 52446f8

Please sign in to comment.