Skip to content

Commit

Permalink
Support for multiple alphabets (protein+dna). Cleanups in tensor_sketch_main. Start testing seq2kmer.
Browse files Browse the repository at this point in the history
  • Loading branch information
danieldanciu committed Nov 24, 2020
1 parent d5d946e commit 87e3a08
Show file tree
Hide file tree
Showing 26 changed files with 320 additions and 186 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest
[submodule "third_party/gflags"]
path = third_party/gflags
url = https://github.com/gflags/gflags
12 changes: 11 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,21 @@ add_compile_options(-O0)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0")

# Google Flags Library
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags EXCLUDE_FROM_ALL)

file(GLOB util_files "util/*.cpp")
add_library(util ${util_files})
target_link_libraries(util gflags)

add_executable(exp_pairwise experiments_main.cpp)
target_link_libraries(exp_pairwise util)

add_executable(sketch tensor_sketch_main.cpp)
target_link_libraries(sketch util)

add_executable(seqgen util/seqgen.cpp)
target_link_libraries(seqgen util)

# TESTS
string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Werror")
Expand All @@ -26,7 +36,7 @@ target_compile_options(gtest PRIVATE -w)
file(GLOB test_files "tests/**/*.cpp")

add_executable(tests ${test_files})
target_link_libraries(tests gtest_main gtest gmock)
target_link_libraries(tests gtest_main gtest gmock util)
target_include_directories(tests PRIVATE "include")

gtest_discover_tests(tests)
Expand Down
18 changes: 8 additions & 10 deletions experiments_main.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#include "util/args.hpp"
#include "util/modules.hpp"
#include "util/multivec.h"
#include "util/seqgen.h"
#include "util/timer.h"
#include "util/utils.h"
#include "util/multivec.hpp"
#include "util/seqgen.hpp"
#include "util/timer.hpp"
#include "util/utils.hpp"

#include <filesystem>
#include <fstream>
Expand All @@ -14,10 +14,7 @@ namespace fs = std::filesystem;
using namespace ts;

struct KmerModule : public BasicModule {
int original_sig_len {};

void override_module_params() override {
original_sig_len = sig_len;
sig_len = int_pow<size_t>(sig_len, kmer_size);
}
};
Expand Down Expand Up @@ -115,15 +112,15 @@ struct SeqGenModule {
}

void compute_sketches() {
int num_seqs = seqs.size();
size_t num_seqs = seqs.size();
kmer_seqs.resize(num_seqs);
wmh_sketch.resize(num_seqs);
mh_sketch.resize(num_seqs);
omh_sketch.resize(num_seqs);
ten_sketch.resize(num_seqs);
slide_sketch.resize(num_seqs);
for (int si = 0; si < num_seqs; si++) {
seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.sig_len);
for (size_t si = 0; si < num_seqs; si++) {
kmer_seqs[si] = seq2kmer<seq_type, seq_type>(seqs[si], basicModules.kmer_size, basicModules.sig_len);
minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params);
weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params);
if (basicModules.tuple_on_kmer) {
Expand All @@ -138,6 +135,7 @@ struct SeqGenModule {
tensor_slide_sketch(seqs[si], slide_sketch[si], basicModules.tensor_slide_params);
}
}

void compute_pairwise_dists() {
int num_seqs = seqs.size();
if (basicModules.mutation_pattern == "pairs") {
Expand Down
4 changes: 2 additions & 2 deletions legacy/align_fasta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

#include "util/args.hpp"
#include "util/modules.hpp"
#include "util/seqgen.h"
#include "util/utils.h"
#include "util/seqgen.hpp"
#include "util/utils.hpp"

using namespace ts;
using namespace BasicTypes;
Expand Down
4 changes: 2 additions & 2 deletions legacy/cross_comp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

#include "util/args.hpp"
#include "util/modules.hpp"
#include "util/seqgen.h"
#include "util/utils.h"
#include "util/seqgen.hpp"
#include "util/utils.hpp"

using namespace ts;
using namespace BasicTypes;
Expand Down
4 changes: 2 additions & 2 deletions legacy/dists_pairwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

#include "util/args.hpp"
#include "util/modules.hpp"
#include "util/seqgen.h"
#include "util/utils.h"
#include "util/seqgen.hpp"
#include "util/utils.hpp"

using namespace ts;
using namespace BasicTypes;
Expand Down
4 changes: 2 additions & 2 deletions legacy/long_seqs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

#include "util/args.hpp"
#include "util/modules.hpp"
#include "util/seqgen.h"
#include "util/utils.h"
#include "util/seqgen.hpp"
#include "util/utils.hpp"

using namespace ts;
using namespace BasicTypes;
Expand Down
4 changes: 2 additions & 2 deletions legacy/test_tensor_disc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

#include "util/args.hpp"
#include "util/modules.hpp"
#include "util/seqgen.h"
#include "util/utils.h"
#include "util/seqgen.hpp"
#include "util/utils.hpp"

using namespace ts;
using namespace BasicTypes;
Expand Down
4 changes: 2 additions & 2 deletions sketch/minhash.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "util/timer.h"
#include "util/utils.h"
#include "util/timer.hpp"
#include "util/utils.hpp"

#include <cstdint>
#include <random>
Expand Down
2 changes: 1 addition & 1 deletion sketch/omh.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once

#include "util/utils.h"
#include "util/utils.hpp"

namespace ts { // ts = Tensor Sketch

Expand Down
2 changes: 1 addition & 1 deletion sketch/tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include <cmath>
#include <cstdio>

#include "util/multivec.h"
#include "util/multivec.hpp"

namespace ts { // ts = Tensor Sketch

Expand Down
111 changes: 33 additions & 78 deletions tensor_sketch_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@

#include "util/args.hpp"
#include "util/modules.hpp"
#include "util/seqgen.h"
#include "util/utils.h"
#include "util/seqgen.hpp"
#include "util/utils.hpp"
#include <sstream>
#include <util/fasta.hpp>

using namespace ts;

template <typename seq_type, class embed_type>
struct SketchModule : public BasicModule {
class SketchModule : public BasicModule {
public:
int original_sig_len {};

void override_module_params() override {
Expand All @@ -20,8 +22,6 @@ struct SketchModule : public BasicModule {
omh_params.max_len = win_len;
}

void override_model_params() override { tensor_slide_params.embed_dim = embed_dim; }

SketchModule() {
directory = "./";
output = "data/sketches/";
Expand All @@ -39,82 +39,15 @@ struct SketchModule : public BasicModule {
sketch_method = "TenSlide";
}

Vec2D<seq_type> seqs;
Vec<std::string> seq_names;
string test_id;


std::map<char, seq_type> chr2int
= { { 'a', 1 }, { 'c', 2 }, { 'g', 3 }, { 't', 4 }, { 'n', 0 },
{ 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } };
std::map<char, seq_type> chr2int_mask
= { { 'a', -1 }, { 'c', -2 }, { 'g', -3 }, { 't', -4 }, { 'n', 0 },
{ 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } };
void read_fasta() {
seqs.clear();
string file = (directory + input);
std::ifstream infile = std::ifstream(file);
assert(infile.is_open());

string line;
std::getline(infile, line);
if (line[0] == '#') {
test_id = line;
std::getline(infile, line);
}
while (line[0] != '>') {
std::getline(infile, line);
}
string name = line;
Vec<seq_type> seq;
while (std::getline(infile, line)) {
if (line[0] == '>') {
seqs.push_back(seq);
seq_names.push_back(name);
seq.clear();
name = line;
} else if (!line.empty()) {
if (format_input == "fasta") {
for (char c : line) {
seq.push_back(chr2int[c]);
}
} else if (format_input == "csv") {
std::stringstream ss(line);
string item;
while (std::getline(ss, item, ',')) {
seq.push_back(std::stoi(item, 0, 16));
}
} else {
std::cerr << " input format `" << format_input << "` does not exist\n";
exit(1);
}
}
}
}

void sketch_slice(Seq<seq_type> seq, Vec<embed_type> &embed) {
if (sketch_method == "MH") {
minhash(seq, embed, mh_params);
} else if (sketch_method == "WMH") {
weighted_minhash(seq, embed, wmh_params);
} else if (sketch_method == "OMH") {
ordered_minhash_flat(seq, embed, omh_params);
} else if (sketch_method == "TenSketch") {
tensor_sketch(seq, embed, tensor_params);
} else {
std::cerr << "method not recognized\n";
exit(1);
}
/// Loads the input file located at `directory + input`, interpreted according to
/// `format_input`, and stores the returned test id, sequences and sequence names
/// in the corresponding members.
void read_input() {
    const auto input_path = directory + input;
    auto parsed = read_fasta<seq_type>(input_path, format_input);
    std::tie(test_id, seqs, seq_names) = parsed;
}

Vec3D<embed_type> slide_sketch;
void compute_sketches() {
// Vec<string> args = {"MH", "WMH", "OMH", "TenSketch", "TenSlide"};
int num_seqs = seqs.size();
size_t num_seqs = seqs.size();
slide_sketch = new3D<embed_type>(seqs.size(), embed_dim, 0);
for (int si = 0; si < num_seqs; si++) {
Vec<seq_type> kmers;
seq2kmer(seqs[si], kmers, kmer_size, original_sig_len);
for (size_t si = 0; si < num_seqs; si++) {
Vec<seq_type> kmers = seq2kmer<seq_type, seq_type>(seqs[si], kmer_size, original_sig_len);
if (sketch_method == "TenSlide") {
tensor_slide_sketch(kmers, slide_sketch[si], tensor_slide_params);
} else {
Expand Down Expand Up @@ -156,13 +89,35 @@ struct SketchModule : public BasicModule {
}
fo.close();
}

private:
/**
 * Sketches one sequence with the algorithm selected by `sketch_method`.
 *
 * Dispatches to minhash ("MH"), weighted_minhash ("WMH"), ordered_minhash_flat ("OMH")
 * or tensor_sketch ("TenSketch"), each with its matching parameter set. Any other
 * method name is reported on stderr and the process exits with status 1.
 *
 * @param seq   the sequence handed to the selected sketching function
 * @param embed passed by reference to the selected sketching function
 *              (presumably filled with the resulting sketch -- confirm against callee)
 */
void sketch_slice(Seq<seq_type> seq, Vec<embed_type> &embed) {
    if (sketch_method == "MH") {
        minhash(seq, embed, mh_params);
    } else if (sketch_method == "WMH") {
        weighted_minhash(seq, embed, wmh_params);
    } else if (sketch_method == "OMH") {
        ordered_minhash_flat(seq, embed, omh_params);
    } else if (sketch_method == "TenSketch") {
        tensor_sketch(seq, embed, tensor_params);
    } else {
        // Fixed typo in the user-facing message ("Unkknown" -> "Unknown").
        std::cerr << "Unknown method: " << sketch_method << std::endl;
        exit(1);
    }
}

private:
Vec2D<seq_type> seqs;
Vec<std::string> seq_names;
Vec3D<embed_type> slide_sketch;
string test_id;
};

int main(int argc, char *argv[]) {
SketchModule<int, double> sketchModule;
sketchModule.parse(argc, argv);
sketchModule.models_init();
sketchModule.read_fasta();
sketchModule.read_input();
sketchModule.compute_sketches();
sketchModule.save_output();
}
2 changes: 1 addition & 1 deletion tests/util/test_multivec.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "util/multivec.h"
#include "util/multivec.hpp"

#include <gtest/gtest.h>

Expand Down
22 changes: 22 additions & 0 deletions tests/util/test_seqgen.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#include "util/seqgen.hpp"

#include <gtest/gtest.h>

#include <random>

namespace {
using namespace ts;

template <typename T>
class Seq2Kmer : public ::testing::Test {};

typedef ::testing::Types<uint64_t, uint32_t> PowTypes;

TYPED_TEST_SUITE(Seq2Kmer, PowTypes);

// An empty input sequence must produce an empty k-mer vector.
// NOTE(review): the body does not use TypeParam, so every type in the suite's
// type list runs exactly the same code -- confirm whether seq2kmer should be
// instantiated with TypeParam instead of int.
TYPED_TEST(Seq2Kmer, Empty) {
    Vec<int> kmers = seq2kmer<int, int>(Seq<int>(), 31, 4);
    // Unsigned literal: comparing a signed 0 against size() triggers
    // -Wsign-compare, which is fatal because tests are built with -Werror.
    ASSERT_EQ(0U, kmers.size());
}

} // namespace
1 change: 1 addition & 0 deletions third_party/gflags
Submodule gflags added at 827c76
Loading

0 comments on commit 87e3a08

Please sign in to comment.