From 87e3a08b899e912cc3014daabc986f9c52999f38 Mon Sep 17 00:00:00 2001
From: Daniel Danciu
Date: Tue, 24 Nov 2020 20:25:33 +0100
Subject: [PATCH] Support for multiple alphabets (protein+dna). Cleanups in
tensor_sketch_main. Start testing seq2kmer.
---
.gitmodules | 3 +
CMakeLists.txt | 12 +++-
experiments_main.cpp | 18 +++--
legacy/align_fasta.cpp | 4 +-
legacy/cross_comp.cpp | 4 +-
legacy/dists_pairwise.cpp | 4 +-
legacy/long_seqs.cpp | 4 +-
legacy/test_tensor_disc.cpp | 4 +-
sketch/minhash.hpp | 4 +-
sketch/omh.hpp | 2 +-
sketch/tensor.hpp | 2 +-
tensor_sketch_main.cpp | 111 +++++++++---------------------
tests/util/test_multivec.cpp | 2 +-
tests/util/test_seqgen.cpp | 22 ++++++
third_party/gflags | 1 +
util/alphabets.cpp | 91 ++++++++++++++++++++++++
util/alphabets.hpp | 16 +++++
util/args.hpp | 2 +-
util/fasta.hpp | 60 ++++++++++++++++
util/modules.hpp | 8 ++-
util/{multivec.h => multivec.hpp} | 2 +-
util/seqgen.cpp | 2 +-
util/{seqgen.h => seqgen.hpp} | 46 ++++++-------
util/{timer.h => timer.cpp} | 19 ++---
util/timer.hpp | 22 ++++++
util/{utils.h => utils.hpp} | 41 +----------
26 files changed, 320 insertions(+), 186 deletions(-)
create mode 100644 tests/util/test_seqgen.cpp
create mode 160000 third_party/gflags
create mode 100644 util/alphabets.cpp
create mode 100644 util/alphabets.hpp
create mode 100644 util/fasta.hpp
rename util/{multivec.h => multivec.hpp} (97%)
rename util/{seqgen.h => seqgen.hpp} (89%)
rename util/{timer.h => timer.cpp} (84%)
create mode 100644 util/timer.hpp
rename util/{utils.h => utils.hpp} (86%)
diff --git a/.gitmodules b/.gitmodules
index 712f95d..8b82325 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest
+[submodule "third_party/gflags"]
+ path = third_party/gflags
+ url = https://github.com/gflags/gflags
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ac434c..e90be03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,11 +7,21 @@ add_compile_options(-O0)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0")
+# Google Flags Library
+add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags EXCLUDE_FROM_ALL)
+
+file(GLOB util_files "util/*.cpp")
+add_library(util ${util_files})
+target_link_libraries(util gflags)
+
add_executable(exp_pairwise experiments_main.cpp)
+target_link_libraries(exp_pairwise util)
add_executable(sketch tensor_sketch_main.cpp)
+target_link_libraries(sketch util)
add_executable(seqgen util/seqgen.cpp)
+target_link_libraries(seqgen util)
# TESTS
string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Werror")
@@ -26,7 +36,7 @@ target_compile_options(gtest PRIVATE -w)
file(GLOB test_files "tests/**/*.cpp")
add_executable(tests ${test_files})
-target_link_libraries(tests gtest_main gtest gmock)
+target_link_libraries(tests gtest_main gtest gmock util)
target_include_directories(tests PRIVATE "include")
gtest_discover_tests(tests)
diff --git a/experiments_main.cpp b/experiments_main.cpp
index ed32da5..d4fe104 100644
--- a/experiments_main.cpp
+++ b/experiments_main.cpp
@@ -1,9 +1,9 @@
#include "util/args.hpp"
#include "util/modules.hpp"
-#include "util/multivec.h"
-#include "util/seqgen.h"
-#include "util/timer.h"
-#include "util/utils.h"
+#include "util/multivec.hpp"
+#include "util/seqgen.hpp"
+#include "util/timer.hpp"
+#include "util/utils.hpp"
#include
#include
@@ -14,10 +14,7 @@ namespace fs = std::filesystem;
using namespace ts;
struct KmerModule : public BasicModule {
- int original_sig_len {};
-
void override_module_params() override {
- original_sig_len = sig_len;
sig_len = int_pow(sig_len, kmer_size);
}
};
@@ -115,15 +112,15 @@ struct SeqGenModule {
}
void compute_sketches() {
- int num_seqs = seqs.size();
+ size_t num_seqs = seqs.size();
kmer_seqs.resize(num_seqs);
wmh_sketch.resize(num_seqs);
mh_sketch.resize(num_seqs);
omh_sketch.resize(num_seqs);
ten_sketch.resize(num_seqs);
slide_sketch.resize(num_seqs);
- for (int si = 0; si < num_seqs; si++) {
- seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.sig_len);
+ for (size_t si = 0; si < num_seqs; si++) {
+ kmer_seqs[si] = seq2kmer(seqs[si], basicModules.kmer_size, basicModules.sig_len);
minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params);
weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params);
if (basicModules.tuple_on_kmer) {
@@ -138,6 +135,7 @@ struct SeqGenModule {
tensor_slide_sketch(seqs[si], slide_sketch[si], basicModules.tensor_slide_params);
}
}
+
void compute_pairwise_dists() {
int num_seqs = seqs.size();
if (basicModules.mutation_pattern == "pairs") {
diff --git a/legacy/align_fasta.cpp b/legacy/align_fasta.cpp
index 955b3a0..f544490 100644
--- a/legacy/align_fasta.cpp
+++ b/legacy/align_fasta.cpp
@@ -3,8 +3,8 @@
#include "util/args.hpp"
#include "util/modules.hpp"
-#include "util/seqgen.h"
-#include "util/utils.h"
+#include "util/seqgen.hpp"
+#include "util/utils.hpp"
using namespace ts;
using namespace BasicTypes;
diff --git a/legacy/cross_comp.cpp b/legacy/cross_comp.cpp
index 3acbb62..973e714 100644
--- a/legacy/cross_comp.cpp
+++ b/legacy/cross_comp.cpp
@@ -3,8 +3,8 @@
#include "util/args.hpp"
#include "util/modules.hpp"
-#include "util/seqgen.h"
-#include "util/utils.h"
+#include "util/seqgen.hpp"
+#include "util/utils.hpp"
using namespace ts;
using namespace BasicTypes;
diff --git a/legacy/dists_pairwise.cpp b/legacy/dists_pairwise.cpp
index b52d09d..3b6f6b3 100644
--- a/legacy/dists_pairwise.cpp
+++ b/legacy/dists_pairwise.cpp
@@ -3,8 +3,8 @@
#include "util/args.hpp"
#include "util/modules.hpp"
-#include "util/seqgen.h"
-#include "util/utils.h"
+#include "util/seqgen.hpp"
+#include "util/utils.hpp"
using namespace ts;
using namespace BasicTypes;
diff --git a/legacy/long_seqs.cpp b/legacy/long_seqs.cpp
index 86376be..0ce2eef 100644
--- a/legacy/long_seqs.cpp
+++ b/legacy/long_seqs.cpp
@@ -3,8 +3,8 @@
#include "util/args.hpp"
#include "util/modules.hpp"
-#include "util/seqgen.h"
-#include "util/utils.h"
+#include "util/seqgen.hpp"
+#include "util/utils.hpp"
using namespace ts;
using namespace BasicTypes;
diff --git a/legacy/test_tensor_disc.cpp b/legacy/test_tensor_disc.cpp
index 63280a3..eb53e71 100644
--- a/legacy/test_tensor_disc.cpp
+++ b/legacy/test_tensor_disc.cpp
@@ -3,8 +3,8 @@
#include "util/args.hpp"
#include "util/modules.hpp"
-#include "util/seqgen.h"
-#include "util/utils.h"
+#include "util/seqgen.hpp"
+#include "util/utils.hpp"
using namespace ts;
using namespace BasicTypes;
diff --git a/sketch/minhash.hpp b/sketch/minhash.hpp
index be9d5cf..e2f40aa 100644
--- a/sketch/minhash.hpp
+++ b/sketch/minhash.hpp
@@ -1,7 +1,7 @@
#pragma once
-#include "util/timer.h"
-#include "util/utils.h"
+#include "util/timer.hpp"
+#include "util/utils.hpp"
#include
#include
diff --git a/sketch/omh.hpp b/sketch/omh.hpp
index af255ed..99bdb32 100644
--- a/sketch/omh.hpp
+++ b/sketch/omh.hpp
@@ -1,6 +1,6 @@
#pragma once
-#include "util/utils.h"
+#include "util/utils.hpp"
namespace ts { // ts = Tensor Sketch
diff --git a/sketch/tensor.hpp b/sketch/tensor.hpp
index dd2fb6a..ee985d6 100644
--- a/sketch/tensor.hpp
+++ b/sketch/tensor.hpp
@@ -3,7 +3,7 @@
#include
#include
-#include "util/multivec.h"
+#include "util/multivec.hpp"
namespace ts { // ts = Tensor Sketch
diff --git a/tensor_sketch_main.cpp b/tensor_sketch_main.cpp
index 3cd0f47..ea12a90 100644
--- a/tensor_sketch_main.cpp
+++ b/tensor_sketch_main.cpp
@@ -3,14 +3,16 @@
#include "util/args.hpp"
#include "util/modules.hpp"
-#include "util/seqgen.h"
-#include "util/utils.h"
+#include "util/seqgen.hpp"
+#include "util/utils.hpp"
#include
+#include
using namespace ts;
template
-struct SketchModule : public BasicModule {
+class SketchModule : public BasicModule {
+ public:
int original_sig_len {};
void override_module_params() override {
@@ -20,8 +22,6 @@ struct SketchModule : public BasicModule {
omh_params.max_len = win_len;
}
- void override_model_params() override { tensor_slide_params.embed_dim = embed_dim; }
-
SketchModule() {
directory = "./";
output = "data/sketches/";
@@ -39,82 +39,15 @@ struct SketchModule : public BasicModule {
sketch_method = "TenSlide";
}
- Vec2D seqs;
- Vec seq_names;
- string test_id;
-
-
- std::map chr2int
- = { { 'a', 1 }, { 'c', 2 }, { 'g', 3 }, { 't', 4 }, { 'n', 0 },
- { 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } };
- std::map chr2int_mask
- = { { 'a', -1 }, { 'c', -2 }, { 'g', -3 }, { 't', -4 }, { 'n', 0 },
- { 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } };
- void read_fasta() {
- seqs.clear();
- string file = (directory + input);
- std::ifstream infile = std::ifstream(file);
- assert(infile.is_open());
-
- string line;
- std::getline(infile, line);
- if (line[0] == '#') {
- test_id = line;
- std::getline(infile, line);
- }
- while (line[0] != '>') {
- std::getline(infile, line);
- }
- string name = line;
- Vec seq;
- while (std::getline(infile, line)) {
- if (line[0] == '>') {
- seqs.push_back(seq);
- seq_names.push_back(name);
- seq.clear();
- name = line;
- } else if (!line.empty()) {
- if (format_input == "fasta") {
- for (char c : line) {
- seq.push_back(chr2int[c]);
- }
- } else if (format_input == "csv") {
- std::stringstream ss(line);
- string item;
- while (std::getline(ss, item, ',')) {
- seq.push_back(std::stoi(item, 0, 16));
- }
- } else {
- std::cerr << " input format `" << format_input << "` does not exist\n";
- exit(1);
- }
- }
- }
- }
-
- void sketch_slice(Seq seq, Vec &embed) {
- if (sketch_method == "MH") {
- minhash(seq, embed, mh_params);
- } else if (sketch_method == "WMH") {
- weighted_minhash(seq, embed, wmh_params);
- } else if (sketch_method == "OMH") {
- ordered_minhash_flat(seq, embed, omh_params);
- } else if (sketch_method == "TenSketch") {
- tensor_sketch(seq, embed, tensor_params);
- } else {
- std::cerr << "method not recognized\n";
- exit(1);
- }
+ void read_input() {
+ std::tie(test_id, seqs, seq_names) = read_fasta(directory + input, format_input);
}
- Vec3D slide_sketch;
void compute_sketches() {
- // Vec args = {"MH", "WMH", "OMH", "TenSketch", "TenSlide"};
- int num_seqs = seqs.size();
+ size_t num_seqs = seqs.size();
slide_sketch = new3D(seqs.size(), embed_dim, 0);
- for (int si = 0; si < num_seqs; si++) {
- Vec kmers;
- seq2kmer(seqs[si], kmers, kmer_size, original_sig_len);
+ for (size_t si = 0; si < num_seqs; si++) {
+ Vec kmers = seq2kmer(seqs[si], kmer_size, original_sig_len);
if (sketch_method == "TenSlide") {
tensor_slide_sketch(kmers, slide_sketch[si], tensor_slide_params);
} else {
@@ -156,13 +89,35 @@ struct SketchModule : public BasicModule {
}
fo.close();
}
+
+ private:
+ void sketch_slice(Seq seq, Vec &embed) {
+ if (sketch_method == "MH") {
+ minhash(seq, embed, mh_params);
+ } else if (sketch_method == "WMH") {
+ weighted_minhash(seq, embed, wmh_params);
+ } else if (sketch_method == "OMH") {
+ ordered_minhash_flat(seq, embed, omh_params);
+ } else if (sketch_method == "TenSketch") {
+ tensor_sketch(seq, embed, tensor_params);
+ } else {
+ std::cerr << "Unkknown method: " << sketch_method << std::endl;
+ exit(1);
+ }
+ }
+
+ private:
+ Vec2D seqs;
+ Vec seq_names;
+ Vec3D slide_sketch;
+ string test_id;
};
int main(int argc, char *argv[]) {
SketchModule sketchModule;
sketchModule.parse(argc, argv);
sketchModule.models_init();
- sketchModule.read_fasta();
+ sketchModule.read_input();
sketchModule.compute_sketches();
sketchModule.save_output();
}
diff --git a/tests/util/test_multivec.cpp b/tests/util/test_multivec.cpp
index 2a8901d..a4861df 100644
--- a/tests/util/test_multivec.cpp
+++ b/tests/util/test_multivec.cpp
@@ -1,4 +1,4 @@
-#include "util/multivec.h"
+#include "util/multivec.hpp"
#include
diff --git a/tests/util/test_seqgen.cpp b/tests/util/test_seqgen.cpp
new file mode 100644
index 0000000..eaa1440
--- /dev/null
+++ b/tests/util/test_seqgen.cpp
@@ -0,0 +1,22 @@
+#include "util/seqgen.hpp"
+
+#include
+
+#include
+
+namespace {
+using namespace ts;
+
+template
+class Seq2Kmer : public ::testing::Test {};
+
+typedef ::testing::Types PowTypes;
+
+TYPED_TEST_SUITE(Seq2Kmer, PowTypes);
+
+TYPED_TEST(Seq2Kmer, Empty) {
+ Vec kmers = seq2kmer(Seq(), 31, 4);
+ ASSERT_EQ(0, kmers.size());
+}
+
+} // namespace
diff --git a/third_party/gflags b/third_party/gflags
new file mode 160000
index 0000000..827c769
--- /dev/null
+++ b/third_party/gflags
@@ -0,0 +1 @@
+Subproject commit 827c769e5fc98e0f2a34c47cef953cc6328abced
diff --git a/util/alphabets.cpp b/util/alphabets.cpp
new file mode 100644
index 0000000..0e31b41
--- /dev/null
+++ b/util/alphabets.cpp
@@ -0,0 +1,91 @@
+#include "alphabets.hpp"
+
+#include
+#include
+#include
+
+namespace ts {
+
+template
+class log2 {
+ static constexpr size_t _log2(size_t x) {
+ if (x < 2) {
+ return 0;
+ } else {
+ return _log2(x >> 1) + 1;
+ }
+ }
+
+ public:
+ static constexpr size_t value = _log2(n);
+};
+
+constexpr uint8_t alphabet_size_dna = 5;
+constexpr char alphabet_dna[] = "ACGTN";
+constexpr uint8_t bits_per_char_dna = 3;
+constexpr uint8_t char2int_tab_dna[128]
+ = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5,
+ 0, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3,
+ 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 };
+
+inline uint32_t char2int_dna(uint8_t c) {
+ return char2int_tab_dna[c];
+}
+
+constexpr char alphabet_protein[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX";
+constexpr uint8_t alphabet_size_protein = sizeof(alphabet_protein) - 1;
+constexpr uint8_t bits_per_char_protein = log2::value + 1;
+constexpr uint8_t char2int_tab_protein[128]
+ = { 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 25, 23, 24, 25, 25, 25, 25, 25, 25, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 23, 24, 25, 25, 25, 25, 25 };
+
+inline uint32_t char2int_protein(uint8_t c) {
+ return char2int_tab_protein[c];
+}
+
+enum class AlphabetType { DNA, Protein };
+
+AlphabetType from_string(std::string str) {
+ std::transform(str.begin(), str.end(), str.begin(),
+ [](unsigned char c){ return std::tolower(c); });
+ if (str == "dna") {
+ return AlphabetType::DNA;
+ } else if (str == "protein") {
+ return AlphabetType::Protein;
+ } else {
+ throw std::logic_error("Invalid alphabet type");
+ }
+}
+
+std::function char2int;
+const char *alphabet;
+uint8_t alphabet_size;
+uint8_t bits_per_char;
+
+void init_alphabet(const std::string &alphabet_str) {
+ switch (from_string(alphabet_str)) {
+ case AlphabetType::DNA:
+ char2int = char2int_dna;
+ alphabet = alphabet_dna;
+ alphabet_size = alphabet_size_dna;
+ bits_per_char = bits_per_char_dna;
+ return;
+ case AlphabetType::Protein:
+ char2int = char2int_protein;
+ alphabet = alphabet_protein;
+ alphabet_size = alphabet_size_protein;
+ bits_per_char = bits_per_char_protein;
+ return;
+ default:
+ std::cerr << "Invalid alphabet type: " << alphabet_str << std::endl;
+ std::exit(1);
+ }
+}
+
+} // namespace ts
diff --git a/util/alphabets.hpp b/util/alphabets.hpp
new file mode 100644
index 0000000..eb196d3
--- /dev/null
+++ b/util/alphabets.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include
+#include
+#include
+
+namespace ts {
+
+extern std::function char2int;
+extern const char *alphabet;
+extern uint8_t alphabet_size;
+extern uint8_t bits_per_char;
+
+void init_alphabet(const std::string &alphabet_str);
+
+} // namespace ts
diff --git a/util/args.hpp b/util/args.hpp
index e19bb1f..fccab78 100644
--- a/util/args.hpp
+++ b/util/args.hpp
@@ -1,6 +1,6 @@
#pragma once
-#include "util/utils.h"
+#include "util/utils.hpp"
#include
#include
diff --git a/util/fasta.hpp b/util/fasta.hpp
new file mode 100644
index 0000000..e52453e
--- /dev/null
+++ b/util/fasta.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "util/alphabets.hpp"
+#include "util/utils.hpp"
+
+#include
+
+namespace ts { // ts = Tensor Sketch
+
+/**
+ * Reads a fasta file and returns its contents as a tuple of .
+ * @tparam seq_type type used for storing a character of the fasta file, typically uint8_t
+ */
+template
+std::tuple, Vec>
+read_fasta(const std::string &file_name, const std::string &format_input) {
+ std::string test_id;
+ Vec2D seqs;
+ Vec seq_names;
+ std::ifstream infile(file_name);
+ assert(infile.is_open() && ("Could not open " + file_name).c_str());
+
+ std::string line;
+ std::getline(infile, line);
+ if (line[0] == '#') {
+ test_id = line;
+ std::getline(infile, line);
+ }
+ while (line[0] != '>') {
+ std::getline(infile, line);
+ }
+ std::string name = line;
+ Vec seq;
+ while (std::getline(infile, line)) {
+ if (line[0] == '>') {
+ seqs.push_back(seq);
+ seq_names.push_back(name);
+ seq.clear();
+ name = line;
+ } else if (!line.empty()) {
+ if (format_input == "fasta") {
+ for (char c : line) {
+ seq.push_back(char2int(c));
+ }
+ } else if (format_input == "csv") {
+ std::stringstream ss(line);
+ std::string item;
+ while (std::getline(ss, item, ',')) {
+ seq.push_back(std::stoi(item, 0, 16));
+ }
+ } else {
+ std::cerr << "Invalid input foramt: " << format_input << std::endl;
+ exit(1);
+ }
+ }
+ }
+ return { test_id, seqs, seq_names };
+}
+
+} // namespace ts
diff --git a/util/modules.hpp b/util/modules.hpp
index fbd31f1..f1f5259 100644
--- a/util/modules.hpp
+++ b/util/modules.hpp
@@ -1,7 +1,8 @@
#pragma once
+#include "util/alphabets.hpp"
#include "util/args.hpp"
-#include "util/seqgen.h"
+#include "util/seqgen.hpp"
#include "sketch/minhash.hpp"
#include "sketch/omh.hpp"
#include "sketch/tensor.hpp"
@@ -10,6 +11,10 @@
#include "sketch/tuple.hpp"
#include "sketch/wminhash.hpp"
+#include
+
+DEFINE_string(alphabet, "dna", "Alphabet of the input data: DNA or Protein [DNA]");
+
namespace ts { // ts = Tensor Sketch
struct BasicModule : public ArgSet {
@@ -91,6 +96,7 @@ struct BasicModule : public ArgSet {
public:
void models_init() {
+ init_alphabet(FLAGS_alphabet);
override_module_params();
init_seqgen(seq_gen);
init_mh_params(mh_params);
diff --git a/util/multivec.h b/util/multivec.hpp
similarity index 97%
rename from util/multivec.h
rename to util/multivec.hpp
index b669002..dc61428 100644
--- a/util/multivec.h
+++ b/util/multivec.hpp
@@ -1,6 +1,6 @@
#pragma once
-#include "util/utils.h"
+#include "util/utils.hpp"
namespace ts { // ts = Tensor Sketch
template
diff --git a/util/seqgen.cpp b/util/seqgen.cpp
index b7979d9..07fcd97 100644
--- a/util/seqgen.cpp
+++ b/util/seqgen.cpp
@@ -3,7 +3,7 @@
#include "util/args.hpp"
#include "util/modules.hpp"
-#include "util/utils.h"
+#include "util/utils.hpp"
using namespace ts;
diff --git a/util/seqgen.h b/util/seqgen.hpp
similarity index 89%
rename from util/seqgen.h
rename to util/seqgen.hpp
index 1c5a54a..b3a7d2b 100644
--- a/util/seqgen.h
+++ b/util/seqgen.hpp
@@ -11,40 +11,36 @@
namespace ts { // ts = Tensor Sketch
using string = std::string;
-// template
-// void read_fasta(Vec2D &seqs, Vec &seq_names, const string &filename, const
-// std::map &tr, int max_num_seqs = 1000) {
-// std::ifstream infile(filename);
-// string line;
-// int si = -1;
-// while (std::getline(infile, line) and seqs.size() <= max_num_seqs) {
-// if (line[0] == '>') {
-// si++;
-// seqs.push_back(Vec());
-// seq_names.push_back(line);
-// } else {
-// for (char c : line) {
-// seqs[si].push_back(tr[c]);
-// }
-// }
-// }
-// }
-
+/**
+ * Extracts kmers from a sequence.
+ * @tparam seq_type types of elements in the sequence
+ * @tparam embed_type type that stores a kmer
+ * @tparam size_type
+ * @param seq
+ * @param kmer_size
+ * @param sig_len
+ * @return
+ */
template
-void seq2kmer(const Seq &seq,
- Vec &vec,
+Vec seq2kmer(const Seq &seq,
size_type kmer_size,
size_type sig_len) {
- Timer::start("seq2kmer");
- vec = Vec(seq.size() - kmer_size + 1);
- for (size_t i = 0; i < vec.size(); i++) {
+ if (seq.size() < (size_t)kmer_size) {
+ return Vec();
+ }
+ Timer::start("seq2kmer");t
+
+ Vec result(seq.size() - kmer_size + 1);
+
+ for (size_t i = 0; i < result.size(); i++) {
size_type c = 1;
for (size_type j = 0; j < kmer_size; j++) {
- vec[i] += c * seq[i + j];
+ result[i] += c * seq[i + j];
c *= sig_len;
}
}
Timer::stop();
+ return result;
}
struct SeqGen {
diff --git a/util/timer.h b/util/timer.cpp
similarity index 84%
rename from util/timer.h
rename to util/timer.cpp
index ccdef80..8b3b28b 100644
--- a/util/timer.h
+++ b/util/timer.cpp
@@ -1,21 +1,14 @@
-#pragma once
-
-#include
-#include