From 87e3a08b899e912cc3014daabc986f9c52999f38 Mon Sep 17 00:00:00 2001 From: Daniel Danciu Date: Tue, 24 Nov 2020 20:25:33 +0100 Subject: [PATCH] Support for multiple alphabets (protein+dna). Cleanups in tensor_sketch_main. Start testing seq2kmer. --- .gitmodules | 3 + CMakeLists.txt | 12 +++- experiments_main.cpp | 18 +++-- legacy/align_fasta.cpp | 4 +- legacy/cross_comp.cpp | 4 +- legacy/dists_pairwise.cpp | 4 +- legacy/long_seqs.cpp | 4 +- legacy/test_tensor_disc.cpp | 4 +- sketch/minhash.hpp | 4 +- sketch/omh.hpp | 2 +- sketch/tensor.hpp | 2 +- tensor_sketch_main.cpp | 111 +++++++++--------------------- tests/util/test_multivec.cpp | 2 +- tests/util/test_seqgen.cpp | 22 ++++++ third_party/gflags | 1 + util/alphabets.cpp | 91 ++++++++++++++++++++++++ util/alphabets.hpp | 16 +++++ util/args.hpp | 2 +- util/fasta.hpp | 60 ++++++++++++++++ util/modules.hpp | 8 ++- util/{multivec.h => multivec.hpp} | 2 +- util/seqgen.cpp | 2 +- util/{seqgen.h => seqgen.hpp} | 46 ++++++------- util/{timer.h => timer.cpp} | 19 ++--- util/timer.hpp | 22 ++++++ util/{utils.h => utils.hpp} | 41 +---------- 26 files changed, 320 insertions(+), 186 deletions(-) create mode 100644 tests/util/test_seqgen.cpp create mode 160000 third_party/gflags create mode 100644 util/alphabets.cpp create mode 100644 util/alphabets.hpp create mode 100644 util/fasta.hpp rename util/{multivec.h => multivec.hpp} (97%) rename util/{seqgen.h => seqgen.hpp} (89%) rename util/{timer.h => timer.cpp} (84%) create mode 100644 util/timer.hpp rename util/{utils.h => utils.hpp} (86%) diff --git a/.gitmodules b/.gitmodules index 712f95d..8b82325 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "third_party/googletest"] path = third_party/googletest url = https://github.com/google/googletest +[submodule "third_party/gflags"] + path = third_party/gflags + url = https://github.com/gflags/gflags diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ac434c..e90be03 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,11 +7,21 @@ add_compile_options(-O0) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0") +# Google Flags Library +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags EXCLUDE_FROM_ALL) + +file(GLOB util_files "util/*.cpp") +add_library(util ${util_files}) +target_link_libraries(util gflags) + add_executable(exp_pairwise experiments_main.cpp) +target_link_libraries(exp_pairwise util) add_executable(sketch tensor_sketch_main.cpp) +target_link_libraries(sketch util) add_executable(seqgen util/seqgen.cpp) +target_link_libraries(seqgen util) # TESTS string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Werror") @@ -26,7 +36,7 @@ target_compile_options(gtest PRIVATE -w) file(GLOB test_files "tests/**/*.cpp") add_executable(tests ${test_files}) -target_link_libraries(tests gtest_main gtest gmock) +target_link_libraries(tests gtest_main gtest gmock util) target_include_directories(tests PRIVATE "include") gtest_discover_tests(tests) diff --git a/experiments_main.cpp b/experiments_main.cpp index ed32da5..d4fe104 100644 --- a/experiments_main.cpp +++ b/experiments_main.cpp @@ -1,9 +1,9 @@ #include "util/args.hpp" #include "util/modules.hpp" -#include "util/multivec.h" -#include "util/seqgen.h" -#include "util/timer.h" -#include "util/utils.h" +#include "util/multivec.hpp" +#include "util/seqgen.hpp" +#include "util/timer.hpp" +#include "util/utils.hpp" #include #include @@ -14,10 +14,7 @@ namespace fs = std::filesystem; using namespace ts; struct KmerModule : public BasicModule { - int original_sig_len {}; - void override_module_params() override { - original_sig_len = sig_len; sig_len = int_pow(sig_len, kmer_size); } }; @@ -115,15 +112,15 @@ struct SeqGenModule { } void compute_sketches() { - int num_seqs = seqs.size(); + size_t num_seqs = seqs.size(); kmer_seqs.resize(num_seqs); wmh_sketch.resize(num_seqs); mh_sketch.resize(num_seqs); omh_sketch.resize(num_seqs); ten_sketch.resize(num_seqs); slide_sketch.resize(num_seqs); - for (int si = 0; si < num_seqs; si++) { - seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.sig_len); + for (size_t si = 0; si < num_seqs; si++) { + kmer_seqs[si] = seq2kmer(seqs[si], basicModules.kmer_size, basicModules.sig_len); minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params); weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params); if (basicModules.tuple_on_kmer) { @@ -138,6 +135,7 @@ struct SeqGenModule { tensor_slide_sketch(seqs[si], slide_sketch[si], basicModules.tensor_slide_params); } } + void compute_pairwise_dists() { int num_seqs = seqs.size(); if (basicModules.mutation_pattern == "pairs") { diff --git a/legacy/align_fasta.cpp b/legacy/align_fasta.cpp index 955b3a0..f544490 100644 --- a/legacy/align_fasta.cpp +++ b/legacy/align_fasta.cpp @@ -3,8 +3,8 @@ #include "util/args.hpp" #include "util/modules.hpp" -#include "util/seqgen.h" -#include "util/utils.h" +#include "util/seqgen.hpp" +#include "util/utils.hpp" using namespace ts; using namespace BasicTypes; diff --git a/legacy/cross_comp.cpp b/legacy/cross_comp.cpp index 3acbb62..973e714 100644 --- a/legacy/cross_comp.cpp +++ b/legacy/cross_comp.cpp @@ -3,8 +3,8 @@ #include "util/args.hpp" #include "util/modules.hpp" -#include "util/seqgen.h" -#include "util/utils.h" +#include "util/seqgen.hpp" +#include "util/utils.hpp" using namespace ts; using namespace BasicTypes; diff --git a/legacy/dists_pairwise.cpp b/legacy/dists_pairwise.cpp index b52d09d..3b6f6b3 100644 --- a/legacy/dists_pairwise.cpp +++ b/legacy/dists_pairwise.cpp @@ -3,8 +3,8 @@ #include "util/args.hpp" #include "util/modules.hpp" -#include "util/seqgen.h" -#include "util/utils.h" +#include "util/seqgen.hpp" +#include "util/utils.hpp" using namespace ts; using namespace BasicTypes; diff --git a/legacy/long_seqs.cpp b/legacy/long_seqs.cpp index 86376be..0ce2eef 100644 --- a/legacy/long_seqs.cpp +++ b/legacy/long_seqs.cpp @@ -3,8 +3,8 @@ #include "util/args.hpp" #include "util/modules.hpp" -#include "util/seqgen.h" -#include "util/utils.h" +#include "util/seqgen.hpp" +#include "util/utils.hpp" using namespace ts; using namespace BasicTypes; diff --git a/legacy/test_tensor_disc.cpp b/legacy/test_tensor_disc.cpp index 63280a3..eb53e71 100644 --- a/legacy/test_tensor_disc.cpp +++ b/legacy/test_tensor_disc.cpp @@ -3,8 +3,8 @@ #include "util/args.hpp" #include "util/modules.hpp" -#include "util/seqgen.h" -#include "util/utils.h" +#include "util/seqgen.hpp" +#include "util/utils.hpp" using namespace ts; using namespace BasicTypes; diff --git a/sketch/minhash.hpp b/sketch/minhash.hpp index be9d5cf..e2f40aa 100644 --- a/sketch/minhash.hpp +++ b/sketch/minhash.hpp @@ -1,7 +1,7 @@ #pragma once -#include "util/timer.h" -#include "util/utils.h" +#include "util/timer.hpp" +#include "util/utils.hpp" #include #include diff --git a/sketch/omh.hpp b/sketch/omh.hpp index af255ed..99bdb32 100644 --- a/sketch/omh.hpp +++ b/sketch/omh.hpp @@ -1,6 +1,6 @@ #pragma once -#include "util/utils.h" +#include "util/utils.hpp" namespace ts { // ts = Tensor Sketch diff --git a/sketch/tensor.hpp b/sketch/tensor.hpp index dd2fb6a..ee985d6 100644 --- a/sketch/tensor.hpp +++ b/sketch/tensor.hpp @@ -3,7 +3,7 @@ #include #include -#include "util/multivec.h" +#include "util/multivec.hpp" namespace ts { // ts = Tensor Sketch diff --git a/tensor_sketch_main.cpp b/tensor_sketch_main.cpp index 3cd0f47..ea12a90 100644 --- a/tensor_sketch_main.cpp +++ b/tensor_sketch_main.cpp @@ -3,14 +3,16 @@ #include "util/args.hpp" #include "util/modules.hpp" -#include "util/seqgen.h" -#include "util/utils.h" +#include "util/seqgen.hpp" +#include "util/utils.hpp" #include +#include using namespace ts; template -struct SketchModule : public BasicModule { +class SketchModule : public BasicModule { + public: int original_sig_len {}; void override_module_params() override { @@ -20,8 +22,6 @@ struct SketchModule : public BasicModule { omh_params.max_len = win_len; } - void override_model_params() override { tensor_slide_params.embed_dim = embed_dim; } - SketchModule() { directory = "./"; output = "data/sketches/"; @@ -39,82 +39,15 @@ struct SketchModule : public BasicModule { sketch_method = "TenSlide"; } - Vec2D seqs; - Vec seq_names; - string test_id; - - - std::map chr2int - = { { 'a', 1 }, { 'c', 2 }, { 'g', 3 }, { 't', 4 }, { 'n', 0 }, - { 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } }; - std::map chr2int_mask - = { { 'a', -1 }, { 'c', -2 }, { 'g', -3 }, { 't', -4 }, { 'n', 0 }, - { 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } }; - void read_fasta() { - seqs.clear(); - string file = (directory + input); - std::ifstream infile = std::ifstream(file); - assert(infile.is_open()); - - string line; - std::getline(infile, line); - if (line[0] == '#') { - test_id = line; - std::getline(infile, line); - } - while (line[0] != '>') { - std::getline(infile, line); - } - string name = line; - Vec seq; - while (std::getline(infile, line)) { - if (line[0] == '>') { - seqs.push_back(seq); - seq_names.push_back(name); - seq.clear(); - name = line; - } else if (!line.empty()) { - if (format_input == "fasta") { - for (char c : line) { - seq.push_back(chr2int[c]); - } - } else if (format_input == "csv") { - std::stringstream ss(line); - string item; - while (std::getline(ss, item, ',')) { - seq.push_back(std::stoi(item, 0, 16)); - } - } else { - std::cerr << " input format `" << format_input << "` does not exist\n"; - exit(1); - } - } - } - } - - void sketch_slice(Seq seq, Vec &embed) { - if (sketch_method == "MH") { - minhash(seq, embed, mh_params); - } else if (sketch_method == "WMH") { - weighted_minhash(seq, embed, wmh_params); - } else if (sketch_method == "OMH") { - ordered_minhash_flat(seq, embed, omh_params); - } else if (sketch_method == "TenSketch") { - tensor_sketch(seq, embed, tensor_params); - } else { - std::cerr << "method not recognized\n"; - exit(1); - } + void read_input() { + std::tie(test_id, seqs, seq_names) = read_fasta(directory + input, format_input); } - Vec3D slide_sketch; void compute_sketches() { - // Vec args = {"MH", "WMH", "OMH", "TenSketch", "TenSlide"}; - int num_seqs = seqs.size(); + size_t num_seqs = seqs.size(); slide_sketch = new3D(seqs.size(), embed_dim, 0); - for (int si = 0; si < num_seqs; si++) { - Vec kmers; - seq2kmer(seqs[si], kmers, kmer_size, original_sig_len); + for (size_t si = 0; si < num_seqs; si++) { + Vec kmers = seq2kmer(seqs[si], kmer_size, original_sig_len); if (sketch_method == "TenSlide") { tensor_slide_sketch(kmers, slide_sketch[si], tensor_slide_params); } else { @@ -156,13 +89,35 @@ struct SketchModule : public BasicModule { } fo.close(); } + + private: + void sketch_slice(Seq seq, Vec &embed) { + if (sketch_method == "MH") { + minhash(seq, embed, mh_params); + } else if (sketch_method == "WMH") { + weighted_minhash(seq, embed, wmh_params); + } else if (sketch_method == "OMH") { + ordered_minhash_flat(seq, embed, omh_params); + } else if (sketch_method == "TenSketch") { + tensor_sketch(seq, embed, tensor_params); + } else { + std::cerr << "Unkknown method: " << sketch_method << std::endl; + exit(1); + } + } + + private: + Vec2D seqs; + Vec seq_names; + Vec3D slide_sketch; + string test_id; }; int main(int argc, char *argv[]) { SketchModule sketchModule; sketchModule.parse(argc, argv); sketchModule.models_init(); - sketchModule.read_fasta(); + sketchModule.read_input(); sketchModule.compute_sketches(); sketchModule.save_output(); } diff --git a/tests/util/test_multivec.cpp b/tests/util/test_multivec.cpp index 2a8901d..a4861df 100644 --- a/tests/util/test_multivec.cpp +++ b/tests/util/test_multivec.cpp @@ -1,4 +1,4 @@ -#include "util/multivec.h" +#include "util/multivec.hpp" #include diff --git a/tests/util/test_seqgen.cpp b/tests/util/test_seqgen.cpp new file mode 100644 index 0000000..eaa1440 --- /dev/null +++ b/tests/util/test_seqgen.cpp @@ -0,0 +1,22 @@ +#include "util/seqgen.hpp" + +#include + +#include + +namespace { +using namespace ts; + +template +class Seq2Kmer : public ::testing::Test {}; + +typedef ::testing::Types PowTypes; + +TYPED_TEST_SUITE(Seq2Kmer, PowTypes); + +TYPED_TEST(Seq2Kmer, Empty) { + Vec kmers = seq2kmer(Seq(), 31, 4); + ASSERT_EQ(0, kmers.size()); +} + +} // namespace diff --git a/third_party/gflags b/third_party/gflags new file mode 160000 index 0000000..827c769 --- /dev/null +++ b/third_party/gflags @@ -0,0 +1 @@ +Subproject commit 827c769e5fc98e0f2a34c47cef953cc6328abced diff --git a/util/alphabets.cpp b/util/alphabets.cpp new file mode 100644 index 0000000..0e31b41 --- /dev/null +++ b/util/alphabets.cpp @@ -0,0 +1,91 @@ +#include "alphabets.hpp" + +#include +#include +#include + +namespace ts { + +template +class log2 { + static constexpr size_t _log2(size_t x) { + if (x < 2) { + return 0; + } else { + return _log2(x >> 1) + 1; + } + } + + public: + static constexpr size_t value = _log2(n); +}; + +constexpr uint8_t alphabet_size_dna = 5; +constexpr char alphabet_dna[] = "ACGTN"; +constexpr uint8_t bits_per_char_dna = 3; +constexpr uint8_t char2int_tab_dna[128] + = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, + 0, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3, + 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }; + +inline uint32_t char2int_dna(uint8_t c) { + return char2int_tab_dna[c]; +} + +constexpr char alphabet_protein[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX"; +constexpr uint8_t alphabet_size_protein = sizeof(alphabet_protein) - 1; +constexpr uint8_t bits_per_char_protein = log2::value + 1; +constexpr uint8_t char2int_tab_protein[128] + = { 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 25, 23, 24, 25, 25, 25, 25, 25, 25, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 23, 24, 25, 25, 25, 25, 25 }; + +inline uint32_t char2int_protein(uint8_t c) { + return char2int_tab_protein[c]; +} + +enum class AlphabetType { DNA, Protein }; + +AlphabetType from_string(std::string str) { + std::transform(str.begin(), str.end(), str.begin(), + [](unsigned char c){ return std::tolower(c); }); + if (str == "dna") { + return AlphabetType::DNA; + } else if (str == "protein") { + return AlphabetType::Protein; + } else { + throw std::logic_error("Invalid alphabet type"); + } +} + +std::function char2int; +const char *alphabet; +uint8_t alphabet_size; +uint8_t bits_per_char; + +void init_alphabet(const std::string &alphabet_str) { + switch (from_string(alphabet_str)) { + case AlphabetType::DNA: + char2int = char2int_dna; + alphabet = alphabet_dna; + alphabet_size = alphabet_size_dna; + bits_per_char = bits_per_char_dna; + return; + case AlphabetType::Protein: + char2int = char2int_protein; + alphabet = alphabet_protein; + alphabet_size = alphabet_size_protein; + bits_per_char = bits_per_char_protein; + return; + default: + std::cerr << "Invalid alphabet type: " << alphabet_str << std::endl; + std::exit(1); + } +} + +} // namespace ts diff --git a/util/alphabets.hpp b/util/alphabets.hpp new file mode 100644 index 0000000..eb196d3 --- /dev/null +++ b/util/alphabets.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include +#include +#include + +namespace ts { + +extern std::function char2int; +extern const char *alphabet; +extern uint8_t alphabet_size; +extern uint8_t bits_per_char; + +void init_alphabet(const std::string &alphabet_str); + +} // namespace ts diff --git a/util/args.hpp b/util/args.hpp index e19bb1f..fccab78 100644 --- a/util/args.hpp +++ b/util/args.hpp @@ -1,6 +1,6 @@ #pragma once -#include "util/utils.h" +#include "util/utils.hpp" #include #include diff --git a/util/fasta.hpp b/util/fasta.hpp new file mode 100644 index 0000000..e52453e --- /dev/null +++ b/util/fasta.hpp @@ -0,0 +1,60 @@ +#pragma once + +#include "util/alphabets.hpp" +#include "util/utils.hpp" + +#include + +namespace ts { // ts = Tensor Sketch + +/** + * Reads a fasta file and returns its contents as a tuple of . + * @tparam seq_type type used for storing a character of the fasta file, typically uint8_t + */ +template +std::tuple, Vec> +read_fasta(const std::string &file_name, const std::string &format_input) { + std::string test_id; + Vec2D seqs; + Vec seq_names; + std::ifstream infile(file_name); + assert(infile.is_open() && ("Could not open " + file_name).c_str()); + + std::string line; + std::getline(infile, line); + if (line[0] == '#') { + test_id = line; + std::getline(infile, line); + } + while (line[0] != '>') { + std::getline(infile, line); + } + std::string name = line; + Vec seq; + while (std::getline(infile, line)) { + if (line[0] == '>') { + seqs.push_back(seq); + seq_names.push_back(name); + seq.clear(); + name = line; + } else if (!line.empty()) { + if (format_input == "fasta") { + for (char c : line) { + seq.push_back(char2int(c)); + } + } else if (format_input == "csv") { + std::stringstream ss(line); + std::string item; + while (std::getline(ss, item, ',')) { + seq.push_back(std::stoi(item, 0, 16)); + } + } else { + std::cerr << "Invalid input foramt: " << format_input << std::endl; + exit(1); + } + } + } + return { test_id, seqs, seq_names }; +} + +} // namespace ts diff --git a/util/modules.hpp b/util/modules.hpp index fbd31f1..f1f5259 100644 --- a/util/modules.hpp +++ b/util/modules.hpp @@ -1,7 +1,8 @@ #pragma once +#include "util/alphabets.hpp" #include "util/args.hpp" -#include "util/seqgen.h" +#include "util/seqgen.hpp" #include "sketch/minhash.hpp" #include "sketch/omh.hpp" #include "sketch/tensor.hpp" @@ -10,6 +11,10 @@ #include "sketch/tuple.hpp" #include "sketch/wminhash.hpp" +#include + +DEFINE_string(alphabet, "dna", "Alphabet of the input data: DNA or Protein [DNA]"); + namespace ts { // ts = Tensor Sketch struct BasicModule : public ArgSet { @@ -91,6 +96,7 @@ struct BasicModule : public ArgSet { public: void models_init() { + init_alphabet(FLAGS_alphabet); override_module_params(); init_seqgen(seq_gen); init_mh_params(mh_params); diff --git a/util/multivec.h b/util/multivec.hpp similarity index 97% rename from util/multivec.h rename to util/multivec.hpp index b669002..dc61428 100644 --- a/util/multivec.h +++ b/util/multivec.hpp @@ -1,6 +1,6 @@ #pragma once -#include "util/utils.h" +#include "util/utils.hpp" namespace ts { // ts = Tensor Sketch template diff --git a/util/seqgen.cpp b/util/seqgen.cpp index b7979d9..07fcd97 100644 --- a/util/seqgen.cpp +++ b/util/seqgen.cpp @@ -3,7 +3,7 @@ #include "util/args.hpp" #include "util/modules.hpp" -#include "util/utils.h" +#include "util/utils.hpp" using namespace ts; diff --git a/util/seqgen.h b/util/seqgen.hpp similarity index 89% rename from util/seqgen.h rename to util/seqgen.hpp index 1c5a54a..b3a7d2b 100644 --- a/util/seqgen.h +++ b/util/seqgen.hpp @@ -11,40 +11,36 @@ namespace ts { // ts = Tensor Sketch using string = std::string; -// template -// void read_fasta(Vec2D &seqs, Vec &seq_names, const string &filename, const -// std::map &tr, int max_num_seqs = 1000) { -// std::ifstream infile(filename); -// string line; -// int si = -1; -// while (std::getline(infile, line) and seqs.size() <= max_num_seqs) { -// if (line[0] == '>') { -// si++; -// seqs.push_back(Vec()); -// seq_names.push_back(line); -// } else { -// for (char c : line) { -// seqs[si].push_back(tr[c]); -// } -// } -// } -// } - +/** + * Extracts kmers from a sequence. + * @tparam seq_type types of elements in the sequence + * @tparam embed_type type that stores a kmer + * @tparam size_type + * @param seq + * @param kmer_size + * @param sig_len + * @return + */ template -void seq2kmer(const Seq &seq, - Vec &vec, +Vec seq2kmer(const Seq &seq, size_type kmer_size, size_type sig_len) { - Timer::start("seq2kmer"); - vec = Vec(seq.size() - kmer_size + 1); - for (size_t i = 0; i < vec.size(); i++) { + if (seq.size() < (size_t)kmer_size) { + return Vec(); + } + Timer::start("seq2kmer");t + + Vec result(seq.size() - kmer_size + 1); + + for (size_t i = 0; i < result.size(); i++) { size_type c = 1; for (size_type j = 0; j < kmer_size; j++) { - vec[i] += c * seq[i + j]; + result[i] += c * seq[i + j]; c *= sig_len; } } Timer::stop(); + return result; } struct SeqGen { diff --git a/util/timer.h b/util/timer.cpp similarity index 84% rename from util/timer.h rename to util/timer.cpp index ccdef80..8b3b28b 100644 --- a/util/timer.h +++ b/util/timer.cpp @@ -1,21 +1,14 @@ -#pragma once - -#include -#include -#include -#include -#include - -namespace ts { // ts = Tensor Sketch +#include "timer.hpp" +namespace ts { namespace Timer { -using namespace std::chrono; -std::map durations; - -auto last_time = high_resolution_clock::now(); +using namespace std::chrono; std::string last_func; +auto last_time = std::chrono::high_resolution_clock::now(); + +std::map durations; void start(std::string func_name) { assert(last_func.empty()); diff --git a/util/timer.hpp b/util/timer.hpp new file mode 100644 index 0000000..5f3cf62 --- /dev/null +++ b/util/timer.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace ts { // ts = Tensor Sketch + +namespace Timer { + +extern std::map durations; + +void start(std::string func_name); + +void stop(); + +std::string summary(); + +} // namespace Timer +} // namespace ts diff --git a/util/utils.h b/util/utils.hpp similarity index 86% rename from util/utils.h rename to util/utils.hpp index aab6938..bc80512 100644 --- a/util/utils.h +++ b/util/utils.hpp @@ -1,6 +1,6 @@ #pragma once -#include "util/timer.h" +#include "util/timer.hpp" #include #include @@ -252,43 +252,4 @@ size_t edit_distance(const Seq &s1, const Seq &s2) { return result; } -size_t edit_distance(const std::string &s1, const std::string &s2) { - const size_t m(s1.size()); - const size_t n(s2.size()); - - if (m == 0) - return n; - if (n == 0) - return m; - - auto costs = new size_t[n + 1]; - - for (size_t k = 0; k <= n; k++) - costs[k] = k; - - size_t i = 0; - for (auto it1 = s1.begin(); it1 != s1.end(); ++it1, ++i) { - costs[0] = i + 1; - size_t corner = i; - - size_t j = 0; - for (auto it2 = s2.begin(); it2 != s2.end(); ++it2, ++j) { - size_t upper = costs[j + 1]; - if (*it1 == *it2) { - costs[j + 1] = corner; - } else { - size_t t(upper < corner ? upper : corner); - costs[j + 1] = (costs[j] < t ? costs[j] : t) + 1; - } - - corner = upper; - } - } - - size_t result = costs[n]; - delete[] costs; - - return result; -} - } // namespace ts