diff --git a/neusomatic/cpp/msa.cpp b/neusomatic/cpp/msa.cpp index f7c3ba5..74e3098 100644 --- a/neusomatic/cpp/msa.cpp +++ b/neusomatic/cpp/msa.cpp @@ -1,47 +1,44 @@ -#include -#include -#include - -#include #include +#include + +#include +#include -#include +#include "msa_options.h" -void MSA(neusomatic::Options const& opt) -{ - using namespace seqan; - CharString seqFileName = opt.input().c_str(); - SeqFileIn seqFileIn(toCString(seqFileName)); - StringSet ids; - StringSet seqs; - try - { - readRecords(ids, seqs, seqFileIn); - } - catch (Exception const & e) - { - std::cout << "ERROR: " << e.what() << std::endl; - return; - } - int n = length(ids); - Align align; - resize(rows(align), n); - for (int i = 0; i < n; ++i){ - assignSource(row(align, i), seqs[i]); - } +void MSA(neusomatic::Options const& opt) { + using namespace seqan; + CharString seqFileName = opt.input().c_str(); + SeqFileIn seqFileIn(toCString(seqFileName)); + StringSet ids; + StringSet seqs; + try { + readRecords(ids, seqs, seqFileIn); + } catch (Exception const& e) { + std::cout << "ERROR: " << e.what() << std::endl; + return; + } + int n = length(ids); + Align align; + resize(rows(align), n); + for (int i = 0; i < n; ++i) { + assignSource(row(align, i), seqs[i]); + } - globalMsaAlignment(align, SimpleScore(opt.match_score(), -opt.mismatch_score(), -opt.gap_extension_score(), -opt.gap_open_score())); + globalMsaAlignment(align, + SimpleScore(opt.match_score(), -opt.mismatch_score(), + -opt.gap_extension_score(), -opt.gap_open_score())); - std::ofstream outfile; - outfile.open(opt.output().c_str()); - for (int i = 0; i < n; ++i){ - outfile << ">" << ids[i] << "\n"; - outfile << row(align, i) << "\n"; - } - outfile.close(); + std::ofstream outfile; + outfile.open(opt.output().c_str()); + for (int i = 0; i < n; ++i) { + outfile << ">" << ids[i] << "\n"; + outfile << row(align, i) << "\n"; + } + outfile.close(); } -int main(int argc, char *argv []) { +int main(int argc, char* argv[]) { neusomatic::Options opt(argc, argv); MSA(opt); return 0; diff --git a/neusomatic/cpp/scan_alignments.cpp b/neusomatic/cpp/scan_alignments.cpp index bea87da..02ff0a2 100644 --- a/neusomatic/cpp/scan_alignments.cpp +++ b/neusomatic/cpp/scan_alignments.cpp @@ -1,14 +1,10 @@ -/** - * File Name : - * Purpose : - * Creation Date : 28-08-2017 - * Last Modified : - * Created By : Mohammad Sahraeian - */ - - - - +/** + * File Name : + * Purpose : + * Creation Date : 28-08-2017 + * Last Modified : + * Created By : Mohammad Sahraeian + */ #include "SeqLib/BamReader.h" #include "SeqLib/RefGenome.h" @@ -25,14 +21,12 @@ #define BGZF_BLOCK_SIZE_BAK #endif - - -#include "vcf.h" +#include "Options.h" #include "bedio.hpp" -#include "targeted_loading.hpp" +#include "vcf.h" #include "msa.hpp" #include "msa_utils.hpp" -#include "Options.h" +#include "targeted_loading.hpp" #ifdef BGZF_MAX_BLOCK_SIZE_BAK #undef BGZF_MAX_BLOCK_SIZE_BAK @@ -44,8 +38,7 @@ #pragma pop_macro("BGZF_BLOCK_SIZE") #endif - -int main(int argc, char **argv) { +int main(int argc, char** argv) { neusomatic::Options opts(argc, argv); const std::string& bam_path = opts.bam_in(); const std::string& bed_in = opts.target_region_in(); @@ -54,20 +47,19 @@ int main(int argc, char **argv) { const std::string& count_out = opts.count_out(); int window_size = opts.window_size(); float snp_min_af = opts.snp_min_allele_freq(); - float ins_min_af = opts.ins_min_allele_freq();; - float del_min_af = opts.del_min_allele_freq();; + float ins_min_af = opts.ins_min_allele_freq(); + float del_min_af = opts.del_min_allele_freq(); int snp_min_ao = opts.snp_min_ao(); int snp_min_bq = opts.snp_min_bq(); int min_depth = opts.min_depth(); - const bool calculate_qual_stat = opts.calculate_qual_stat(); - const bool report_all_alleles = opts.report_all_alleles(); - const bool report_count_for_all_positions = opts.report_count_for_all_positions(); - - + const bool calculate_qual_stat = opts.calculate_qual_stat(); + const bool report_all_alleles = opts.report_all_alleles(); + const bool report_count_for_all_positions = opts.report_count_for_all_positions(); auto start_main = std::chrono::system_clock::now(); - //const std::map empty_pileup_counts = {{'-', 0}, {'A', 0}, {'C', 0}, {'G', 0}, {'T', 0}}; + // const std::map empty_pileup_counts = {{'-', 0}, {'A', 0}, {'C', 0}, {'G', + // 0}, {'T', 0}}; static const std::vector nuc_code_char = {'A', 'C', 'G', 'T', '-', 'N'}; const int matrix_base_pad = 7; @@ -81,115 +73,120 @@ int main(int argc, char **argv) { ref_seqs.LoadIndex(ref_in); neusomatic::bio::VCFWriter vcf_writer(vcf_out, ref_in, "VarCal"); - const unsigned contig_counts = seqan::length(seqan::contigNames(vcf_writer.vcf_context())); + const unsigned contig_counts = + seqan::length(seqan::contigNames(vcf_writer.vcf_context())); std::map chr_name_to_idx; for (unsigned i = 0; i < contig_counts; ++i) { - chr_name_to_idx[seqan::toCString(seqan::contigNames(vcf_writer.vcf_context())[i])] = i; + chr_name_to_idx[seqan::toCString(seqan::contigNames(vcf_writer.vcf_context())[i])] = + i; } std::ofstream count_out_writer; count_out_writer.open(count_out); std::string count_out_str = ""; - neusomatic::CaptureLayout capture_layout(bam_path, bed_regions, opts); - SeqLib::BamHeader bam_header = capture_layout.Reader().Header(); - + neusomatic::CaptureLayout capture_layout( + bam_path, bed_regions, opts); + SeqLib::BamHeader bam_header = capture_layout.Reader().Header(); + try { for (auto it = chr_name_to_idx.cbegin(); it != chr_name_to_idx.cend(); ++it) { if (bam_header.Name2ID(it->first) != it->second) { - std::cerr << " the bam file and the reference do not match.\n exit..\n please check the bam header and the reference file.\n"; + std::cerr << " the bam file and the reference do not match.\n exit..\n please " + "check the bam header and the reference file.\n"; exit(1); } } } catch (const std::out_of_range& oor) { - std::cerr << " the reference contains chromosome/contig name(s) that is/are not in the bam file.\n exit..\n please check the bam header and the reference file.\n"; + std::cerr + << " the reference contains chromosome/contig name(s) that is/are not in the bam " + "file.\n exit..\n please check the bam header and the reference file.\n"; exit(1); } catch (const std::invalid_argument& ia) { - std::cerr << " the reference contains chromosome/contig name(s) that is/are not in the bam file.\n exit..\n please check the bam header and the reference file.\n"; + std::cerr + << " the reference contains chromosome/contig name(s) that is/are not in the bam " + "file.\n exit..\n please check the bam header and the reference file.\n"; exit(1); } auto end_main_0 = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_main_0 = end_main_0-start_main; + std::chrono::duration elapsed_seconds_main_0 = end_main_0 - start_main; std::time_t end_time_main_0 = std::chrono::system_clock::to_time_t(end_main_0); - std::cout << "elapsed_main_0 time: " << elapsed_seconds_main_0.count() << "s\n"; - + std::cout << "elapsed_main_0 time: " << elapsed_seconds_main_0.count() << "s\n"; - int last_var_pos=-1; + int last_var_pos = -1; - int cnt_region=0; + int cnt_region = 0; auto timestamp = opts.verbosity() > 0; while (capture_layout.NextRegion(opts.fully_contained())) { - auto start_0_while = std::chrono::system_clock::now(); // a map from genomic interval -> a vector of alignReadIds for (auto targeted_region : capture_layout.ginv_records()) { - - auto start_0_for = std::chrono::system_clock::now(); auto start_0 = std::chrono::system_clock::now(); auto ginv = targeted_region.first; auto& records = targeted_region.second; - GInvStdString ginvstr(bam_header.IDtoName(ginv.contig()), ginv.left(), ginv.right()); + GInvStdString ginvstr(bam_header.IDtoName(ginv.contig()), ginv.left(), + ginv.right()); if (opts.verbosity() > 2 || (cnt_region % 100 == 0 || timestamp)) { - std::cerr<<"#On region "< opts.max_depth()) { records.resize(opts.max_depth()); } - const auto non_gapped_ref = ref_seqs.QueryRegion(ginvstr.contig(), ginvstr.left(), ginvstr.right() - 1); - if (ginv.length() > non_gapped_ref.size()) { + const auto non_gapped_ref = + ref_seqs.QueryRegion(ginvstr.contig(), ginvstr.left(), ginvstr.right() - 1); + if (ginv.length() > non_gapped_ref.size()) { ginv.right() = ginv.left() + non_gapped_ref.size(); } auto end_0 = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_0 = end_0-start_0; + std::chrono::duration elapsed_seconds_0 = end_0 - start_0; std::time_t end_time_0 = std::chrono::system_clock::to_time_t(end_0); - if (timestamp){ - std::cout << "elapsed_0 time: " << elapsed_seconds_0.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_0 time: " << elapsed_seconds_0.count() << "s\n"; } auto start_1 = std::chrono::system_clock::now(); neusomatic::CondensedArray condensed_array; auto end_1 = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_1 = end_1-start_1; + std::chrono::duration elapsed_seconds_1 = end_1 - start_1; std::time_t end_time_1 = std::chrono::system_clock::to_time_t(end_1); - if (timestamp){ - std::cout << "elapsed_1 time: " << elapsed_seconds_1.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_1 time: " << elapsed_seconds_1.count() << "s\n"; } auto start_2 = std::chrono::system_clock::now(); MSA msa(ginv, records, non_gapped_ref); auto end_2 = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_2 = end_2-start_2; + std::chrono::duration elapsed_seconds_2 = end_2 - start_2; std::time_t end_time_2 = std::chrono::system_clock::to_time_t(end_2); - if (timestamp){ - std::cout << "elapsed_2 time: " << elapsed_seconds_2.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_2 time: " << elapsed_seconds_2.count() << "s\n"; } auto start_3 = std::chrono::system_clock::now(); - + condensed_array = neusomatic::CondensedArray(msa, calculate_qual_stat); - auto end_3 = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_3 = end_3-start_3; + std::chrono::duration elapsed_seconds_3 = end_3 - start_3; std::time_t end_time_3 = std::chrono::system_clock::to_time_t(end_3); - if (timestamp){ - std::cout << "elapsed_3 time: " << elapsed_seconds_3.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_3 time: " << elapsed_seconds_3.count() << "s\n"; } // auto start_4 = std::chrono::system_clock::now(); - + // auto cols = condensed_array.GetColSpace(); // auto cols = condensed_array.GetColSpaceMQual(); // auto cols = condensed_array.GetColSpaceStrand(); @@ -200,42 +197,42 @@ int main(int argc, char **argv) { // const auto ref = condensed_array.GetGappedRef(); // const auto cc = neusomatic::ChangeCoordinates(ref); - // auto end_4 = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds_4 = end_4-start_4; // std::time_t end_time_4 = std::chrono::system_clock::to_time_t(end_4); // if (timestamp){ - // std::cout << "elapsed_4 time: " << elapsed_seconds_4.count() << "s\n"; + // std::cout << "elapsed_4 time: " << elapsed_seconds_4.count() << "s\n"; // } auto start_4 = std::chrono::system_clock::now(); - + auto ncol = condensed_array.ncol(); const auto ref = condensed_array.GetGappedRef(); const auto cc = neusomatic::ChangeCoordinates(ref); auto end_4 = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_4 = end_4-start_4; + std::chrono::duration elapsed_seconds_4 = end_4 - start_4; std::time_t end_time_4 = std::chrono::system_clock::to_time_t(end_4); - if (timestamp){ - std::cout << "elapsed_4 time: " << elapsed_seconds_4.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_4 time: " << elapsed_seconds_4.count() << "s\n"; } auto start_5 = std::chrono::system_clock::now(); std::vector var_cols; - if (( cnt_region == 1 ) or (last_var_pos >= 0)) { + if ((cnt_region == 1) or (last_var_pos >= 0)) { var_cols.push_back(0); } last_var_pos = -1; for (size_t i = 0; i < ncol; i++) { - // auto start_4p0 = std::chrono::system_clock::now(); - if (ginv.left() + cc.UngapPos(i) >=ginv.right()){ break;} + if (ginv.left() + cc.UngapPos(i) >= ginv.right()) { + break; + } auto ref_base = ref[i]; ref_base = std::toupper(ref_base); auto ref_code = neusomatic::CondensedArray::DnaCharToDnaCode(ref_base); @@ -246,63 +243,67 @@ int main(int argc, char **argv) { auto base_freq_ = condensed_array.GetBaseFreq(i); - if (opts.verbosity()>1){ - std::cout<<"col "< 1) { + std::cout << "col " << i << ": "; + std::cout << "(ref= " << ref_base << ") "; + for (int base = 0; base < (int)base_freq_.size(); ++base) { + std::cout << "(" << nuc_code_char[base] << "): " << base_freq_[base] << "; "; } - std::cout<< std::endl; + std::cout << std::endl; } - - auto nrow = condensed_array.nrow()-base_freq_[5]; + auto nrow = condensed_array.nrow() - base_freq_[5]; base_freq_.erase(base_freq_.begin() + 5); std::vector pileup_counts(base_freq_.size()); - int total_count=0; - for (int base = 0; base < (int) base_freq_.size(); ++base) { + int total_count = 0; + for (int base = 0; base < (int)base_freq_.size(); ++base) { pileup_counts[base] = base_freq_[base]; - total_count+=base_freq_[base]; + total_count += base_freq_[base]; + } + + if (total_count == 0) { + continue; + } + if ((ref_base != '-') and (total_count < min_depth)) { + continue; } - - if (total_count==0) {continue;} - if ((ref_base!='-') and ( total_count elapsed_seconds_4p0 = end_4p0-start_4p0; // std::time_t end_time_4p0 = std::chrono::system_clock::to_time_t(end_4p0); - // std::cout << "elapsed_4p0 time: " << elapsed_seconds_4p0.count() << "s\n"; + // std::cout << "elapsed_4p0 time: " << elapsed_seconds_4p0.count() << "s\n"; // auto start_4p1 = std::chrono::system_clock::now(); - std::map alt_counts; auto ref_count = base_freq_[ref_code]; - auto var_code = ref_code; + auto var_code = ref_code; int var_count = 0; int dp = ref_count; - if (report_all_alleles and ref_base != '-'){ - for (int row = 0; row < base_freq_.size(); ++row) { + if (report_all_alleles and ref_base != '-') { + for (int row = 0; row < base_freq_.size(); ++row) { auto alt_cnt = base_freq_[row]; - if (( row != ref_code) and (alt_cnt > 0)){ - auto af = alt_cnt/float(alt_cnt+ref_count); - - if ((alt_cnt >= ref_count) or ((row == 4 and af > del_min_af ) or - (row != 4 and ref_base != '-' and ((af >= snp_min_af) or (alt_cnt >= snp_min_ao))) or - (ref_base =='-' and af > ins_min_af))){ - if (row != 4 and ref_base != '-'){ + if ((row != ref_code) and (alt_cnt > 0)) { + auto af = alt_cnt / float(alt_cnt + ref_count); + + if ((alt_cnt >= ref_count) or + ((row == 4 and af > del_min_af) or + (row != 4 and ref_base != '-' and + ((af >= snp_min_af) or (alt_cnt >= snp_min_ao))) or + (ref_base == '-' and af > ins_min_af))) { + if (row != 4 and ref_base != '-') { bool passed_bq = true; - if (calculate_qual_stat){ + if (calculate_qual_stat) { auto bqual_mean_ = condensed_array.GetBQMean(i); passed_bq = int(round(bqual_mean_[row])) >= snp_min_bq; } - if (not passed_bq){ + if (not passed_bq) { continue; } } @@ -311,7 +312,7 @@ int main(int argc, char **argv) { } } } - }else{ + } else { int major = -1; int major_count = 0; int minor = -1; @@ -319,7 +320,7 @@ int main(int argc, char **argv) { int minor2 = -1; int minor2_count = 0; - for (int row = 0; row < base_freq_.size(); ++row) { + for (int row = 0; row < base_freq_.size(); ++row) { if (base_freq_[row] > major_count) { minor2 = minor; minor2_count = minor_count; @@ -338,109 +339,119 @@ int main(int argc, char **argv) { } } - if (minor != -1 and major != -1){ - if (minor2 != -1 and ref_code == major and minor == 4 and ref_code != 4 ){ - if (minor2_count>0.5*minor_count){ + if (minor != -1 and major != -1) { + if (minor2 != -1 and ref_code == major and minor == 4 and ref_code != 4) { + if (minor2_count > 0.5 * minor_count) { minor = minor2; minor_count = minor2_count; } } } - auto af = minor_count/float(major_count+minor_count); - if (major != ref_code){ + auto af = minor_count / float(major_count + minor_count); + if (major != ref_code) { var_code = major; var_count = major_count; - } else if (minor != ref_code and ( (minor == 4 and af > del_min_af ) or - (minor != 4 and ref_base != '-' and ((af >= snp_min_af) or (minor_count >= snp_min_ao))) or - (ref_base =='-' and af > ins_min_af))){ - if (minor != 4 and ref_base != '-'){ + } else if (minor != ref_code and + ((minor == 4 and af > del_min_af) or + (minor != 4 and ref_base != '-' and + ((af >= snp_min_af) or (minor_count >= snp_min_ao))) or + (ref_base == '-' and af > ins_min_af))) { + if (minor != 4 and ref_base != '-') { bool passed_bq = true; - if (calculate_qual_stat){ + if (calculate_qual_stat) { auto bqual_mean_ = condensed_array.GetBQMean(i); passed_bq = int(round(bqual_mean_[minor])) >= snp_min_bq; } - if (not passed_bq){ + if (not passed_bq) { continue; } } var_code = minor; var_count = minor_count; } - if (var_count > 0) { - alt_counts.insert(std::pair(var_code,var_count)); + if (var_count > 0) { + alt_counts.insert(std::pair(var_code, var_count)); dp += var_count; } } // for(auto it = alt_counts.cbegin(); it != alt_counts.cend(); ++it) // { // std::cout << it->first << " " << it->second << std::endl; - // } - - + // } // auto end_4p1 = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds_4p1 = end_4p1-start_4p1; // std::time_t end_time_4p1 = std::chrono::system_clock::to_time_t(end_4p1); - // std::cout << "elapsed_4p1 time: " << elapsed_seconds_4p1.count() << "s\n"; + // std::cout << "elapsed_4p1 time: " << elapsed_seconds_4p1.count() << "s\n"; // auto start_4p2 = std::chrono::system_clock::now(); - if ( alt_counts.size() > 0 ){ + if (alt_counts.size() > 0) { var_cols.push_back(i); last_var_pos = ginv.left() + cc.UngapPos(i); } - for(auto it = alt_counts.cbegin(); it != alt_counts.cend(); ++it) - { + for (auto it = alt_counts.cbegin(); it != alt_counts.cend(); ++it) { auto var_code_ = it->first; auto var_count_ = it->second; - auto record_info = "AF="+std::to_string((var_count_)/float(dp))+";DP="+std::to_string(nrow)+";RO="+std::to_string(ref_count)+";AO="+std::to_string(var_count_); - auto gtinfo = "0/1:"+std::to_string(nrow)+":"+std::to_string(ref_count)+":"+std::to_string(var_count_); - if (calculate_qual_stat){ - + auto record_info = "AF=" + std::to_string((var_count_) / float(dp)) + + ";DP=" + std::to_string(nrow) + + ";RO=" + std::to_string(ref_count) + + ";AO=" + std::to_string(var_count_); + auto gtinfo = "0/1:" + std::to_string(nrow) + ":" + std::to_string(ref_count) + + ":" + std::to_string(var_count_); + if (calculate_qual_stat) { auto bqual_mean_ = condensed_array.GetBQMean(i); auto mqual_mean_ = condensed_array.GetMQMean(i); auto strand_mean_ = condensed_array.GetStrandMean(i); auto lsc_mean_ = condensed_array.GetLSCMean(i); auto rsc_mean_ = condensed_array.GetRSCMean(i); auto tag_mean_ = condensed_array.GetTagMean(i); - - int rsc_counts=0; - int lsc_counts=0; - for(auto it = lsc_mean_.cbegin(); it != lsc_mean_.cend(); ++it) { - rsc_counts+=*it; + + int rsc_counts = 0; + int lsc_counts = 0; + for (auto it = lsc_mean_.cbegin(); it != lsc_mean_.cend(); ++it) { + rsc_counts += *it; } - for(auto it = rsc_mean_.cbegin(); it != rsc_mean_.cend(); ++it) { - lsc_counts+=*it; + for (auto it = rsc_mean_.cbegin(); it != rsc_mean_.cend(); ++it) { + lsc_counts += *it; } - - record_info += ";ST="+std::to_string(int(round(ref_count*(strand_mean_[ref_code]/100))))+ \ - ","+std::to_string(int(round(var_count_*(strand_mean_[var_code_]/100))))+ \ - ";LS="+std::to_string(lsc_counts)+\ - ";RS="+std::to_string(rsc_counts)+\ - ";NM="+std::to_string(int(round(tag_mean_[var_code_][0])))+\ - ";AS="+std::to_string(int(round(tag_mean_[var_code_][1])))+ \ - ";XS="+std::to_string(int(round(tag_mean_[var_code_][2])))+ \ - ";PR="+std::to_string(int(round(tag_mean_[var_code_][3])))+ \ - ";CL="+std::to_string(int(round(tag_mean_[var_code_][4])))+ \ - ";MQ="+std::to_string(int(round(mqual_mean_[var_code_])))+ \ - ";BQ="+std::to_string(int(round(bqual_mean_[var_code_]))); - gtinfo += ":"+std::to_string(int(round(ref_count*(strand_mean_[ref_code]/100))))+","+ \ - std::to_string(int(round(var_count_*(strand_mean_[var_code_]/100))))+":"+\ - std::to_string(lsc_counts)+":"+\ - std::to_string(rsc_counts)+":"+\ - std::to_string(int(round(tag_mean_[var_code_][0])))+":"+\ - std::to_string(int(round(tag_mean_[var_code_][1])))+":"+\ - std::to_string(int(round(tag_mean_[var_code_][2])))+":"+\ - std::to_string(int(round(tag_mean_[var_code_][3])))+":"+\ - std::to_string(int(round(tag_mean_[var_code_][4])))+":"+\ - std::to_string(int(round(mqual_mean_[var_code_])))+":"+\ - std::to_string(int(round(bqual_mean_[var_code_]))); + record_info += + ";ST=" + + std::to_string(int(round(ref_count * (strand_mean_[ref_code] / 100)))) + + "," + + std::to_string(int(round(var_count_ * (strand_mean_[var_code_] / 100)))) + + ";LS=" + std::to_string(lsc_counts) + + ";RS=" + std::to_string(rsc_counts) + + ";NM=" + std::to_string(int(round(tag_mean_[var_code_][0]))) + + ";AS=" + std::to_string(int(round(tag_mean_[var_code_][1]))) + + ";XS=" + std::to_string(int(round(tag_mean_[var_code_][2]))) + + ";PR=" + std::to_string(int(round(tag_mean_[var_code_][3]))) + + ";CL=" + std::to_string(int(round(tag_mean_[var_code_][4]))) + + ";MQ=" + std::to_string(int(round(mqual_mean_[var_code_]))) + + ";BQ=" + std::to_string(int(round(bqual_mean_[var_code_]))); + gtinfo += + ":" + + std::to_string(int(round(ref_count * (strand_mean_[ref_code] / 100)))) + + "," + + std::to_string(int(round(var_count_ * (strand_mean_[var_code_] / 100)))) + + ":" + std::to_string(lsc_counts) + ":" + std::to_string(rsc_counts) + + ":" + std::to_string(int(round(tag_mean_[var_code_][0]))) + ":" + + std::to_string(int(round(tag_mean_[var_code_][1]))) + ":" + + std::to_string(int(round(tag_mean_[var_code_][2]))) + ":" + + std::to_string(int(round(tag_mean_[var_code_][3]))) + ":" + + std::to_string(int(round(tag_mean_[var_code_][4]))) + ":" + + std::to_string(int(round(mqual_mean_[var_code_]))) + ":" + + std::to_string(int(round(bqual_mean_[var_code_]))); } - auto var_base = nuc_code_char[var_code_]; - if (ref_base == '-') {ref_base = 'N';} - if (var_base == '-') {var_base = 'N';} - auto var_ref_pos=ginv.left() + cc.UngapPos(i); + auto var_base = nuc_code_char[var_code_]; + if (ref_base == '-') { + ref_base = 'N'; + } + if (var_base == '-') { + var_base = 'N'; + } + auto var_ref_pos = ginv.left() + cc.UngapPos(i); seqan::VcfRecord record; record.rID = ginv.contig(); record.beginPos = var_ref_pos; @@ -450,53 +461,56 @@ int main(int argc, char **argv) { record.qual = 100; record.filter = "."; record.info = record_info; - char * info; - if (calculate_qual_stat){ + char* info; + if (calculate_qual_stat) { record.format = "GT:DP:RO:AO:ST:LS:RS:NM:AS:XS:PR:CL:MQ:BQ"; - }else{ + } else { record.format = "GT:DP:RO:AO"; } appendValue(record.genotypeInfos, gtinfo); vcf_writer.Write(record); - if (opts.verbosity()>1){ - std::cout<<"var: " << i << "," << var_ref_pos << ","<< ref_base << "," << var_base<<","< 1) { + std::cout << "var: " << i << "," << var_ref_pos << "," << ref_base << "," + << var_base << "," << nrow << ":" << ref_count << ":" << var_count_ + << std::endl; + std::cout << "col " << i << ": "; + std::cout << "(ref= " << ref_base << ") "; for (size_t row = 0; row < base_freq_.size(); ++row) { - std::cout<<"("< elapsed_seconds_4p2 = end_4p2-start_4p2; // std::time_t end_time_4p2 = std::chrono::system_clock::to_time_t(end_4p2); - // std::cout << "elapsed_4p2 time: " << elapsed_seconds_4p2.count() << "s\n"; - + // std::cout << "elapsed_4p2 time: " << elapsed_seconds_4p2.count() << "s\n"; } - if (last_var_pos>=0){ - if (ginv.right()>(last_var_pos+matrix_base_pad+2)){ - last_var_pos=-1; + if (last_var_pos >= 0) { + if (ginv.right() > (last_var_pos + matrix_base_pad + 2)) { + last_var_pos = -1; } } - var_cols.push_back(std::max(0,int(ncol)-1)); + var_cols.push_back(std::max(0, int(ncol) - 1)); std::vector count_cols; - if (report_count_for_all_positions){ + if (report_count_for_all_positions) { for (size_t i = 0; i < ncol; i++) { count_cols.push_back(i); } - }else{ + } else { int processed_col = -1; - for (auto& j : var_cols){ + for (auto& j : var_cols) { // std::cout << j << "," << processed_col << std::endl; std::vector count_cols_0; int cnt_0 = 0; - for (auto i=j; i >= (processed_col+1); --i){ - if (ginv.left() + cc.UngapPos(i) >=ginv.right()){ break;} + for (auto i = j; i >= (processed_col + 1); --i) { + if (ginv.left() + cc.UngapPos(i) >= ginv.right()) { + break; + } auto ref_base = ref[i]; ref_base = std::toupper(ref_base); auto ref_code = neusomatic::CondensedArray::DnaCharToDnaCode(ref_base); @@ -508,36 +522,38 @@ int main(int argc, char **argv) { continue; } cnt_0++; - if (cnt_0 >= matrix_base_pad+2){ + if (cnt_0 >= matrix_base_pad + 2) { break; } } - std::reverse(count_cols_0.begin(),count_cols_0.end()); + std::reverse(count_cols_0.begin(), count_cols_0.end()); count_cols.insert(count_cols.end(), count_cols_0.begin(), count_cols_0.end()); std::vector count_cols_1; int cnt_1 = 0; - for (auto i=j+1; i < ncol; ++i){ - if (ginv.left() + cc.UngapPos(i) >=ginv.right()){ break;} + for (auto i = j + 1; i < ncol; ++i) { + if (ginv.left() + cc.UngapPos(i) >= ginv.right()) { + break; + } auto ref_base = ref[i]; ref_base = std::toupper(ref_base); auto ref_code = neusomatic::CondensedArray::DnaCharToDnaCode(ref_base); if (ref_base == 'N') { ref_base = '-'; } - if (i >= (processed_col+1)){ + if (i >= (processed_col + 1)) { count_cols_1.push_back(i); } if (ref_base == '-') { continue; } cnt_1++; - if (cnt_1 >= matrix_base_pad+1){ + if (cnt_1 >= matrix_base_pad + 1) { break; } } - count_cols.insert(count_cols.end(), count_cols_1.begin(), count_cols_1.end()); - if (! count_cols.empty()) { + count_cols.insert(count_cols.end(), count_cols_1.begin(), count_cols_1.end()); + if (!count_cols.empty()) { processed_col = count_cols.back(); // std::cout << ginv.left() + cc.UngapPos(processed_col) << std::endl; } @@ -547,19 +563,18 @@ int main(int argc, char **argv) { // for (auto& iii : count_cols_1){ // std::cout << "count_cols_1, " << iii << std::endl; // } - } } auto end_5 = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_5 = end_5-start_5; + std::chrono::duration elapsed_seconds_5 = end_5 - start_5; std::time_t end_time_5 = std::chrono::system_clock::to_time_t(end_5); - if (timestamp){ - std::cout << "elapsed_5 time: " << elapsed_seconds_5.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_5 time: " << elapsed_seconds_5.count() << "s\n"; } auto start_6 = std::chrono::system_clock::now(); - for (auto& i : count_cols){ + for (auto& i : count_cols) { auto ref_base = ref[i]; ref_base = std::toupper(ref_base); auto ref_code = neusomatic::CondensedArray::DnaCharToDnaCode(ref_base); @@ -567,8 +582,8 @@ int main(int argc, char **argv) { if (ref_base == 'N') { ref_base = '-'; } - auto start_pos=ginv.left() + cc.UngapPos(i); - if (ref_base!='-'){ + auto start_pos = ginv.left() + cc.UngapPos(i); + if (ref_base != '-') { start_pos++; } @@ -576,22 +591,22 @@ int main(int argc, char **argv) { base_freq_.erase(base_freq_.begin() + 5); std::vector pileup_counts(base_freq_.size()); - int total_count=0; - for (int base = 0; base < (int) base_freq_.size(); ++base) { + int total_count = 0; + for (int base = 0; base < (int)base_freq_.size(); ++base) { pileup_counts[base] = base_freq_[base]; - total_count+=base_freq_[base]; + total_count += base_freq_[base]; + } + if (total_count == 0) { + continue; } - if (total_count==0) {continue;} - if (calculate_qual_stat){ - + if (calculate_qual_stat) { auto bqual_mean_ = condensed_array.GetBQMean(i); auto mqual_mean_ = condensed_array.GetMQMean(i); auto strand_mean_ = condensed_array.GetStrandMean(i); auto lsc_mean_ = condensed_array.GetLSCMean(i); auto rsc_mean_ = condensed_array.GetRSCMean(i); auto tag_mean_ = condensed_array.GetTagMean(i); - // count_out_writer< elapsed_seconds_6 = end_6-start_6; + std::chrono::duration elapsed_seconds_6 = end_6 - start_6; std::time_t end_time_6 = std::chrono::system_clock::to_time_t(end_6); - if (timestamp){ - std::cout << "elapsed_6 time: " << elapsed_seconds_6.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_6 time: " << elapsed_seconds_6.count() << "s\n"; } auto end_0_for = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_0_for = end_0_for-start_0_for; + std::chrono::duration elapsed_seconds_0_for = end_0_for - start_0_for; std::time_t end_time_0_for = std::chrono::system_clock::to_time_t(end_0_for); - if (timestamp){ - std::cout << "elapsed_0_for time: " << elapsed_seconds_0_for.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_0_for time: " << elapsed_seconds_0_for.count() << "s\n"; } - } //end for + } // end for auto end_0_while = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_0_while = end_0_while-start_0_while; + std::chrono::duration elapsed_seconds_0_while = end_0_while - start_0_while; std::time_t end_time_0_while = std::chrono::system_clock::to_time_t(end_0_while); - if (timestamp){ - std::cout << "elapsed_0_while time: " << elapsed_seconds_0_while.count() << "s\n"; + if (timestamp) { + std::cout << "elapsed_0_while time: " << elapsed_seconds_0_while.count() << "s\n"; } - } //end while - + } // end while auto start_w = std::chrono::system_clock::now(); // count_out_writer << count_out_str; auto end_w = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_w = end_w-start_w; + std::chrono::duration elapsed_seconds_w = end_w - start_w; std::time_t end_time_w = std::chrono::system_clock::to_time_t(end_w); - std::cout << "elapsed_w time: " << elapsed_seconds_w.count() << "s\n"; - - + std::cout << "elapsed_w time: " << elapsed_seconds_w.count() << "s\n"; auto start_w2 = std::chrono::system_clock::now(); count_out_writer.close(); auto end_w2 = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_w2 = end_w2-start_w2; + std::chrono::duration elapsed_seconds_w2 = end_w2 - start_w2; std::time_t end_time_w2 = std::chrono::system_clock::to_time_t(end_w2); - std::cout << "elapsed_w2 time: " << elapsed_seconds_w2.count() << "s\n"; - - - + std::cout << "elapsed_w2 time: " << elapsed_seconds_w2.count() << "s\n"; auto end_main = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds_main = end_main-start_main; + std::chrono::duration elapsed_seconds_main = end_main - start_main; std::time_t end_time_main = std::chrono::system_clock::to_time_t(end_main); - std::cout << "elapsed_main time: " << elapsed_seconds_main.count() << "s\n"; + std::cout << "elapsed_main time: " << elapsed_seconds_main.count() << "s\n"; return 0; } diff --git a/neusomatic/include/Interval.hpp b/neusomatic/include/Interval.hpp index 9ddbe5d..52e4aa3 100644 --- a/neusomatic/include/Interval.hpp +++ b/neusomatic/include/Interval.hpp @@ -1,35 +1,30 @@ #ifndef INTERVAL_HPP #define INTERVAL_HPP -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include -namespace neusomatic { namespace bio { - -enum class StrandType{ - PLUS=0, - MINUS, - UNKNOWN -}; - -inline std::ostream& operator<<(std::ostream &os, const StrandType& st) -{ - switch(st) - { - case StrandType::PLUS: - os<<"+"; - break; - case StrandType::MINUS: - os<<"-"; - break; - case StrandType::UNKNOWN: - os<<"?"; - break; +namespace neusomatic { +namespace bio { + +enum class StrandType { PLUS = 0, MINUS, UNKNOWN }; + +inline std::ostream& operator<<(std::ostream& os, const StrandType& st) { + switch (st) { + case StrandType::PLUS: + os << "+"; + break; + case StrandType::MINUS: + os << "-"; + break; + case StrandType::UNKNOWN: + os << "?"; + break; } return os; } @@ -37,122 +32,124 @@ inline std::ostream& operator<<(std::ostream &os, const StrandType& st) // // Utility functions: // -template -inline decltype (auto) coverage(const std::vector& invs) -> std::vector { - +template +inline decltype(auto) coverage(const std::vector& invs) + -> std::vector { typedef typename Interval::Depth CovType; - const auto& lmin = std::min_element(invs.begin(), invs.end(), - [&](const auto& lhs, const auto& rhs ){ - return lhs.left() < rhs.left(); - }); - const auto& rmax = std::max_element(invs.begin(), invs.end(), - [&](const auto& lhs, const auto& rhs ){ - return lhs.right() < rhs.right(); - }); + const auto& lmin = std::min_element( + invs.begin(), invs.end(), + [&](const auto& lhs, const auto& rhs) { return lhs.left() < rhs.left(); }); + const auto& rmax = std::max_element( + invs.begin(), invs.end(), + [&](const auto& lhs, const auto& rhs) { return lhs.right() < rhs.right(); }); int left = lmin->left(); int right = rmax->right(); assert(left < right); std::vector coverage; - if (half_open) coverage.resize(right - left, 0); - else coverage.resize(right - left + 1, 0); + if (half_open) + coverage.resize(right - left, 0); + else + coverage.resize(right - left + 1, 0); for (auto s = invs.cbegin(); s != invs.cend(); ++s) { int end = 0; - if (half_open) end = s->right(); - else end = s->right() + 1; + if (half_open) + end = s->right(); + else + end = s->right() + 1; for (int p = s->left(); p < end; ++p) { - coverage[p-left] += s->depth(); + coverage[p - left] += s->depth(); } } return coverage; } -// -// -// -struct BasicInterval{ +struct BasicInterval { /* * half open interval encoding. * e.g. [left, right) -> coverage */ - using Depth = int ; - BasicInterval(): i1(std::numeric_limits::max()), - i2(std::numeric_limits::min()){} + using Depth = int; + BasicInterval() + : i1(std::numeric_limits::max()), i2(std::numeric_limits::min()) {} BasicInterval(int l, int r) : i1(l), i2(r) {} - virtual ~BasicInterval() {} // virtual distructor - virtual const int left() const {return i1;} - virtual const int right() const {return i2;} - virtual int& left() {return i1;} - virtual int& right() {return i2;} - virtual int depth() const {return 1;} + virtual ~BasicInterval() {} // virtual distructor + virtual const int left() const { return i1; } + virtual const int right() const { return i2; } + virtual int& left() { return i1; } + virtual int& right() { return i2; } + virtual int depth() const { return 1; } virtual size_t length() const { - if (right() < left() ) return 0; - else return right() - left(); + if (right() < left()) + return 0; + else + return right() - left(); } virtual bool Valid() const { return (i1 != std::numeric_limits::max() && i2 != std::numeric_limits::min()); } - int i1; // compatible with seqan + int i1; // compatible with seqan int i2; }; -inline std::ostream& operator<<(std::ostream& os, const BasicInterval& bi){ - os <<"interval ["< rhs.left()) { + } else if (lhs.left() > rhs.left()) { return false; } else { - if (lhs.right() < rhs.right()) return true; - else return false; + if (lhs.right() < rhs.right()) + return true; + else + return false; } } - -template -class GenomicInterval : public BasicInterval{ +template +class GenomicInterval : public BasicInterval { /* * In compliance with seqan: 0-based, half open interval [a, b). - * Contig is the + * Contig is the */ public: using TContig = Contig; - GenomicInterval(): contig_(), strand_(StrandType::UNKNOWN){} - GenomicInterval(Contig ctg, StrandType strand): contig_(ctg), strand_(strand){} + GenomicInterval() : contig_(), strand_(StrandType::UNKNOWN) {} + GenomicInterval(Contig ctg, StrandType strand) : contig_(ctg), strand_(strand) {} - GenomicInterval(const Contig& ctg, int l, int r): - BasicInterval(l,r), contig_(ctg), strand_(StrandType::UNKNOWN){} + GenomicInterval(const Contig& ctg, int l, int r) + : BasicInterval(l, r), contig_(ctg), strand_(StrandType::UNKNOWN) {} - GenomicInterval(const Contig& ctg, const StrandType& s, int l, int r): - BasicInterval(l, r), contig_(ctg), strand_(s) {} + GenomicInterval(const Contig& ctg, const StrandType& s, int l, int r) + : BasicInterval(l, r), contig_(ctg), strand_(s) {} - GenomicInterval(const Contig& ctg, const StrandType& s, int l, int r, const Haplo& hap): - BasicInterval(l, r), contig_(ctg), strand_(s), haplotype_(hap) {} + GenomicInterval(const Contig& ctg, const StrandType& s, int l, int r, const Haplo& hap) + : BasicInterval(l, r), contig_(ctg), strand_(s), haplotype_(hap) {} - //int lenght() const {return right() - left();} + // int lenght() const {return right() - left();} - decltype(auto) contig() const {return contig_;} - decltype(auto) contig() {return (contig_);} + decltype(auto) contig() const { return contig_; } + decltype(auto) contig() { return (contig_); } - decltype(auto) strand() const {return strand_;} + decltype(auto) strand() const { return strand_; } - decltype(auto) haplotype() const {return haplotype_;} - decltype(auto) haplotype() {return haplotype_;} + decltype(auto) haplotype() const { return haplotype_; } + decltype(auto) haplotype() { return haplotype_; } std::string string() const { std::string result; @@ -166,44 +163,45 @@ class GenomicInterval : public BasicInterval{ private: Contig contig_; - StrandType strand_; + StrandType strand_; Haplo haplotype_; }; -template -inline bool operator==(const GenomicInterval& lhs,const GenomicInterval& rhs) -{ - if (lhs.contig() != rhs.contig()) return false; - if (lhs.strand() != rhs.strand()) return false; - if (lhs.left() != rhs.left()) return false; - if (lhs.right() != rhs.right()) return false; - if (lhs.haplotype() != rhs.haplotype()) return false; +template +inline bool operator==(const GenomicInterval& lhs, + const GenomicInterval& rhs) { + if (lhs.contig() != rhs.contig()) + return false; + if (lhs.strand() != rhs.strand()) + return false; + if (lhs.left() != rhs.left()) + return false; + if (lhs.right() != rhs.right()) + return false; + if (lhs.haplotype() != rhs.haplotype()) + return false; return true; } -template -inline bool operator!=(const GenomicInterval& lhs,const GenomicInterval& rhs) -{ - return !(lhs==rhs); +template +inline bool operator!=(const GenomicInterval& lhs, + const GenomicInterval& rhs) { + return !(lhs == rhs); +} + +template +inline std::ostream& operator<<(std::ostream& os, + const GenomicInterval& gInv) { + // print based on 1-base coordinates; + os << gInv.contig() << ":" << gInv.strand() << " [" << gInv.left() + 1 << "," + << gInv.right() + 1 << ") " << gInv.haplotype(); + return os; } -template -inline std::ostream& operator<<(std::ostream& os, - const GenomicInterval& gInv) -{ - //print based on 1-base coordinates; - os<< gInv.contig() - <<":"< -inline bool operator<(const GenomicInterval & lhs, - const GenomicInterval& rhs){ - //sort contig by alphabetic order +template +inline bool operator<(const GenomicInterval& lhs, + const GenomicInterval& rhs) { + // sort contig by alphabetic order if (lhs.contig() < rhs.contig()) { return true; } else if (lhs.contig() > rhs.contig()) { @@ -211,29 +209,30 @@ inline bool operator<(const GenomicInterval & lhs, } else { if (lhs.left() < rhs.left()) { return true; - } else if (lhs.left() > rhs.left()) { + } else if (lhs.left() > rhs.left()) { return false; } else { - if (lhs.right() < rhs.right()) return true; - else return false; + if (lhs.right() < rhs.right()) + return true; + else + return false; } } } -template -class IRanges{ +template +class IRanges { /* * Use BasicInterval as default interval representation inside class */ private: - std::vector invs_; int left_; int right_; typename Interval::TContig contig_; - void Setup_(){ + void Setup_() { if (invs_.empty()) { throw std::runtime_error("IRangs cannot build empty interval set"); } @@ -243,15 +242,13 @@ class IRanges{ throw std::runtime_error("IRangs intervals cannot come from different contigs"); } } - const auto& lmin = std::min_element(invs_.begin(), invs_.end(), - [&](const auto& lhs, const auto& rhs ){ - return lhs.left() < rhs.left(); - }); + const auto& lmin = std::min_element( + invs_.begin(), invs_.end(), + [&](const auto& lhs, const auto& rhs) { return lhs.left() < rhs.left(); }); - const auto& rmax = std::max_element(invs_.begin(), invs_.end(), - [&](const auto& lhs, const auto& rhs ){ - return lhs.right() < rhs.right(); - }); + const auto& rmax = std::max_element( + invs_.begin(), invs_.end(), + [&](const auto& lhs, const auto& rhs) { return lhs.right() < rhs.right(); }); left_ = lmin->left(); right_ = rmax->right(); @@ -261,14 +258,14 @@ class IRanges{ typedef typename std::vector::const_iterator const_iterator; typedef typename std::vector::size_type size_type; - - template > + template > IRanges(IntVec starts, IntVec ends) { - assert (starts.size() == ends.size()); + assert(starts.size() == ends.size()); auto e = ends.cbegin(); for (auto s = starts.cbegin(); s != starts.cend(); ++s) { - int end_pos = *e ; - if (!half_open) ++end_pos; // if input is a close interval, convert to half open interval + int end_pos = *e; + if (!half_open) + ++end_pos; // if input is a close interval, convert to half open interval Interval inv(*s, end_pos); invs_.push_back(move(inv)); ++e; @@ -276,26 +273,26 @@ class IRanges{ Setup_(); } - IRanges(std::vector invs): invs_(invs) { + IRanges(std::vector invs) : invs_(invs) { if (!half_open) { - for (Interval& inv: invs_) { - inv.right()++ ; - } + for (Interval& inv : invs_) { + inv.right()++; + } } Setup_(); } size_type size() { return invs_.size(); } - const_iterator cbegin() {return invs_.cbegin();} - const_iterator cend() {return invs_.cend();} + const_iterator cbegin() { return invs_.cbegin(); } + const_iterator cend() { return invs_.cend(); } std::vector reduce() const { /* * Merge redundant ranges, and return the minimum * non-overlapping ranges covering all the input ranges. */ - std::vector cov = coverage (invs_); + std::vector cov = coverage(invs_); std::vector result; bool prev_base_is_covered = false; @@ -304,7 +301,6 @@ class IRanges{ for (auto it = cov.cbegin(); it != cov.cend(); ++it) { if (*it > 0) { if (prev_base_is_covered) { - } else { Interval inv; inv.contig() = contig_; @@ -318,7 +314,6 @@ class IRanges{ result.back().right() = left_ + std::distance(cov.cbegin(), it); open = false; } else { - } prev_base_is_covered = false; } @@ -328,22 +323,22 @@ class IRanges{ } if (!half_open) { - for (auto& res: result) res.right() = res.right() -1; + for (auto& res : result) res.right() = res.right() - 1; } return result; } std::vector disjoint() const { - /* - * return non-overlapping intervals - */ - std::vector cov = coverage (invs_); + /* + * return non-overlapping intervals + */ + std::vector cov = coverage(invs_); std::vector bars; std::vector result; - for (const auto& inv: invs_) { - bars.push_back( inv.left() ); - bars.push_back( inv.right() ); + for (const auto& inv : invs_) { + bars.push_back(inv.left()); + bars.push_back(inv.right()); } sort(bars.begin(), bars.end()); const auto& last = unique(bars.begin(), bars.end()); @@ -360,10 +355,10 @@ class IRanges{ } else { if (*it == result.back().left()) { result.pop_back(); - } - else { + } else { result.back().right() = *it; - if (cov[*it - left_ ] > 0) --it; + if (cov[*it - left_] > 0) + --it; } is_left = true; } @@ -373,13 +368,12 @@ class IRanges{ } if (!half_open) { - for (auto& res: result) res.right() = res.right() -1 ; + for (auto& res : result) res.right() = res.right() - 1; } return result; } }; - /* * Below are utility functions: * Those functions must take half closed interval [a,b) @@ -387,75 +381,77 @@ class IRanges{ */ template -inline bool IsOverlapped(const TInterval &x, const TInterval &y) -{ - //intervally this function use closed interval - int yi2 = y.i2 -1; - int xi2 = x.i2 -1; - if ( x.i1 <= yi2 && y.i1 <= xi2 ) return true; - else return false; +inline bool IsOverlapped(const TInterval& x, const TInterval& y) { + // intervally this function use closed interval + int yi2 = y.i2 - 1; + int xi2 = x.i2 - 1; + if (x.i1 <= yi2 && y.i1 <= xi2) + return true; + else + return false; } template -inline bool IsContainedIn(const TInterval &x, const TInterval &y) -{ - if (x.contig() != y.contig()) return false; +inline bool IsContainedIn(const TInterval& x, const TInterval& y) { + if (x.contig() != y.contig()) + return false; return (x.i1 >= y.i1 && x.i2 <= y.i2 && x.i1 < y.i2 && x.i2 > y.i1); } -template -inline bool IsNearTheStartOf(const TInterval &x, const TInterval &y, int wiggleroom) -{ +template +inline bool IsNearTheStartOf(const TInterval& x, const TInterval& y, int wiggleroom) { // return true if x is near the start of y - if (x.i1 < y.i1 + wiggleroom) return true; - else return false; + if (x.i1 < y.i1 + wiggleroom) + return true; + else + return false; } - -template -inline bool IsNearTheEndOf(const TInterval &x, const TInterval &y, int wiggleroom) -{ +template +inline bool IsNearTheEndOf(const TInterval& x, const TInterval& y, int wiggleroom) { // return true if x is near the end of y - if (x.i2 > y.i2 - wiggleroom) return true; - else return false; + if (x.i2 > y.i2 - wiggleroom) + return true; + else + return false; } - template -inline TInterval Intersect(const TInterval &x, const TInterval &y) -{ +inline TInterval Intersect(const TInterval& x, const TInterval& y) { // return a 0 length interval if non-overlap TInterval ret; ret.i1 = std::max(x.i1, y.i1); ret.i2 = std::min(x.i2, y.i2); if (ret.i2 <= ret.i1) { - return TInterval(ret.i1, ret.i1, x.cargo); - } else{ + return TInterval(ret.i1, ret.i1, x.cargo); + } else { ret.cargo = x.cargo; return ret; } } -template -inline neusomatic::bio::GenomicInterval Intersect(const neusomatic::bio::GenomicInterval &x, const neusomatic::bio::GenomicInterval &y) -{ +template +inline neusomatic::bio::GenomicInterval Intersect( + const neusomatic::bio::GenomicInterval& x, + const neusomatic::bio::GenomicInterval& y) { // return a 0 length interval if non-overlap neusomatic::bio::GenomicInterval ret; - if (x.contig() != y.contig()) return ret; + if (x.contig() != y.contig()) + return ret; ret.left() = std::max(x.i1, y.i1); ret.right() = std::min(x.i2, y.i2); - if (ret.i2 <= ret.i1) ret.i2 = ret.i1; + if (ret.i2 <= ret.i1) + ret.i2 = ret.i1; return ret; } -template -inline std::vector FindIntervals(const TIntervalTree& inv_tree, - const std::vector& invs){ +template +inline std::vector FindIntervals(const TIntervalTree& inv_tree, + const std::vector& invs) { std::vector results; for (auto i = invs.cbegin(); i != invs.cend(); ++i) { - seqan::String overlapped_invs; seqan::findIntervals(overlapped_invs, inv_tree, i->left(), i->right()); @@ -472,8 +468,8 @@ inline decltype(auto) SplitByContig(const std::vector invs) { using TContig = typename TInterval::TContig; std::map> result; for (auto it = invs.cbegin(); it != invs.cend(); ++it) { - auto search = result.find(it->contig()); - if (search == result.end()) { + auto search = result.find(it->contig()); + if (search == result.end()) { std::vector invs_for_one_ctg; invs_for_one_ctg.push_back(*it); result.emplace(it->contig(), invs_for_one_ctg); @@ -486,11 +482,12 @@ inline decltype(auto) SplitByContig(const std::vector invs) { template inline auto Length(const TInterval& inv) { - if (inv.i1 > inv.i2) return 0; + if (inv.i1 > inv.i2) + return 0; return inv.i2 - inv.i1; } -}//namespace bio -}//namespace neusomatic +} // namespace bio +} // namespace neusomatic #endif /* INTERVAL_HPP */ diff --git a/neusomatic/include/Options.h b/neusomatic/include/Options.h index 727c811..7334c85 100644 --- a/neusomatic/include/Options.h +++ b/neusomatic/include/Options.h @@ -2,230 +2,266 @@ #ifndef LIB_INCLUDE_OPTIONS_H_ #define LIB_INCLUDE_OPTIONS_H_ +#include + #include #include #include -#include namespace neusomatic { - static struct option long_options[] = { - {"verbosity", required_argument, 0, 'v'}, - {"bam", required_argument, 0, 'b'}, - {"bed", required_argument, 0, 'L'}, - {"ref", required_argument, 0, 'r'}, - {"calculate_qual_stat", no_argument, 0, 'c'}, - {"min_mapq", required_argument, 0, 'q'}, - {"snp_min_bq", required_argument, 0, 'n'}, - {"snp_min_af", required_argument, 0, 'a'}, - {"ins_min_af", required_argument, 0, 'i'}, - {"del_min_af", required_argument, 0, 'l'}, - {"snp_min_ao", required_argument, 0, 'M'}, - {"out_vcf_file", required_argument, 0, 'f'}, - {"out_count_file", required_argument, 0, 'o'}, - {"fully_contained", no_argument, 0, 'y'}, - {"window_size", required_argument, 0, 'w'}, - {"num_threads", required_argument, 0, 't'}, - {"max_depth", required_argument, 0, 'd'}, - {"min_depth", required_argument, 0, 'h'}, - {"include_secondary", no_argument, 0, 's'}, - {"filter_duplicate", no_argument, 0, 'D'}, - {"filter_QCfailed", no_argument, 0, 'Q'}, - {"filter_improper_pair", no_argument, 0, 'E'}, - {"filter_mate_unmapped", no_argument, 0, 'F'}, - {"filter_improper_orientation", no_argument, 0, 'G'}, - {"report_all_alleles", no_argument, 0, 'A'}, - {"report_count_for_all_positions", no_argument, 0, 'C'}, - {0, 0, 0, 0} // terminator - }; - - const char *short_options = "v:b:L:r:q:f:o:w:a:t:d:cysDQEFG"; - - void print_help() - { - std::cerr<< "---------------------------------------------------\n"; - std::cerr<< "Usage: scan_alignments [options] -b BAM -L BED -r REF\n"; - std::cerr<< "General Options:\n"; - std::cerr<< "-v/--verbosity, verbosity level from 0 to 9, default 0.\n"; - std::cerr<< "-b/--bam, bam file path, required.\n"; - std::cerr<< "-L/--bed, targeted region in bam format, required.\n"; - std::cerr<< "-r/--ref, reference file path, required.\n"; - std::cerr<< "-c/--calculate_qual_stat, calculating base quality and other stats, default False.\n"; - std::cerr<< "-q/--min_mapq, minimum mapping quality, default 0.\n"; - std::cerr<< "-n/--snp_min_bq, SNP minimum base quality, default 10.\n"; - std::cerr<< "-a/--snp_min_af, SNP minimum allele freq, default 0.01.\n"; - std::cerr<< "-i/--ins_min_af, INS minimum allele freq, default 0.01.\n"; - std::cerr<< "-l/--del_min_af, DEL minimum allele freq, default 0.01.\n"; - std::cerr<< "-M/--snp_min_ao, SNP minimum alternate count for low AF candidates, default 3.\n"; - std::cerr<< "-f/--out_vcf_file, output vcf file path, required.\n"; - std::cerr<< "-o/--out_count_file, output count file path, required.\n"; - std::cerr<< "-w/--window_size, window size to scan the variants, default is 15.\n"; - std::cerr<< "-y/--fully_contained, if this option is on. A read has to be fully contained in the region, default is False.\n"; - // std::cerr<< "-t/--num_threads, number or thread used for building the count matrix, default is 4.\n"; - std::cerr<< "-d/--max_depth, maximum depth for building the count matrix, default is 40,000.\n"; - std::cerr<< "-h/--min_depth, minimum depth for building the count matrix, default is 0.\n"; - std::cerr<< "-s/--include_secondary, consider secondary alignments, default is False.\n"; - std::cerr<< "-D/--filter_duplicate, filter duplicate reads if the flag is set, default is False.\n"; - std::cerr<< "-Q/--filter_QCfailed, filter QC failed reads if the flag is set, default is False.\n"; - std::cerr<< "-E/--filter_improper_pair, filter improper pairs if the flag is set, default is False.\n"; - std::cerr<< "-F/--filter_mate_unmapped, filter reads with unmapeed mates if the flag is set, default is False.\n"; - std::cerr<< "-G/--filter_improper_orientation, filter reads with improper orientation (not FR) or different chrom, default is False.\n"; - std::cerr<< "-A/--report_all_alleles, report all alleles per column, default is False.\n"; - std::cerr<< "-C/--report_count_for_all_positions, report counts for all positions, default is False.\n"; - } - - int parseInt(const char* optarg, int lower, const char *errmsg, void (*print_help)()) { - long l; - char *endPtr= NULL; - l = strtol(optarg, &endPtr, 10); - if (endPtr != NULL) { - if (l < lower ) { - std::cerr << errmsg << std::endl; - print_help(); - exit(1); - } - return (int32_t)l; - } - std::cerr<< errmsg < upper) - { - std::cerr << errmsg << std::endl; - print_help(); - exit(1); + std::cerr << errmsg << std::endl; + print_help(); + exit(1); } + return (int32_t)l; + } + std::cerr << errmsg << std::endl; + print_help(); + exit(1); + return -1; +} - return l; +float parseFloat(const char* optarg, float lower, float upper, const char* errmsg, + void (*print_help)()) { + float l; + l = (float)atof(optarg); + if (l < lower) { std::cerr << errmsg << std::endl; print_help(); exit(1); - return -1; } - template - int parse_options(int argc, char* argv[], Options& opt) { - int option_index = 0; - int next_option; - do { - next_option = getopt_long(argc, argv, short_options, long_options, &option_index); - switch(next_option) { - case -1: - break; - case 'v': - opt.verbosity() = parseInt(optarg, 0, "-v/--verbosity must be at least 0", print_help); - break; - case 'b': - opt.bam_in() = optarg; - break; - case 'L': - opt.target_region_in() = optarg; - break; - case 'r': - opt.ref() = optarg; - break; - case 'c': - opt.calculate_qual_stat() = true; - break; - case 'q': - opt.min_mapq() = parseInt(optarg, 0, "-q/--min_mapq must be at least 0", print_help); - break; - case 'n': - opt.snp_min_bq() = parseInt(optarg, 0, "-n/--snp_min_bq must be at least 0", print_help); - break; - case 'f': - opt.vcf_out() = optarg; - break; - case 'd': - opt.max_depth() = parseInt(optarg, 1, "-d/--max_depth must be at least 1", print_help); - break; - case 'h': - opt.min_depth() = parseInt(optarg, 1, "-h/--min_depth must be at least 1", print_help); - break; - case 'o': - opt.count_out() = optarg; - break; - case 'w': - opt.window_size() = parseInt(optarg, 10, "-k/--window_size must be at least 10", print_help); - break; - case 'y': - opt.fully_contained() = true; - break; - case 's': - opt.include_secondary() = true; - break; - case 'D': - opt.filter_duplicate() = true; - break; - case 'Q': - opt.filter_QCfailed() = true; - break; - case 'E': - opt.filter_improper_pair() = true; - break; - case 'F': - opt.filter_mate_unmapped() = true; - break; - case 'G': - opt.filter_improper_orientation() = true; - break; - case 'A': - opt.report_all_alleles() = true; - break; - case 'C': - opt.report_count_for_all_positions() = true; - break; - case 'a': - opt.snp_min_allele_freq() = parseFloat(optarg, 0.0, 1.0, "-a/--snp_min_af must be between 0 and 1", print_help); - break; - case 'i': - opt.ins_min_allele_freq() = parseFloat(optarg, 0.0, 1.0, "-i/--ins_min_af must be between 0 and 1", print_help); - break; - case 'l': - opt.del_min_allele_freq() = parseFloat(optarg, 0.0, 1.0, "-l/--del_min_af must be between 0 and 1", print_help); - break; - case 'M': - opt.snp_min_ao() = parseInt(optarg, 1, "-M/--snp_min_ao must be at least 1", print_help); - break; - case 't': - //opt.num_threads() = parseInt(optarg, 1, "-t/--num_threads must be at least 1", print_help); - break; - default: - return 1; - } - } while (next_option != -1); - - return 0; + if (l > upper) { + std::cerr << errmsg << std::endl; + print_help(); + exit(1); } + return l; + + std::cerr << errmsg << std::endl; + print_help(); + exit(1); + return -1; +} + +template +int parse_options(int argc, char* argv[], Options& opt) { + int option_index = 0; + int next_option; + do { + next_option = getopt_long(argc, argv, short_options, long_options, &option_index); + switch (next_option) { + case -1: + break; + case 'v': + opt.verbosity() = + parseInt(optarg, 0, "-v/--verbosity must be at least 0", print_help); + break; + case 'b': + opt.bam_in() = optarg; + break; + case 'L': + opt.target_region_in() = optarg; + break; + case 'r': + opt.ref() = optarg; + break; + case 'c': + opt.calculate_qual_stat() = true; + break; + case 'q': + opt.min_mapq() = + parseInt(optarg, 0, "-q/--min_mapq must be at least 0", print_help); + break; + case 'n': + opt.snp_min_bq() = + parseInt(optarg, 0, "-n/--snp_min_bq must be at least 0", print_help); + break; + case 'f': + opt.vcf_out() = optarg; + break; + case 'd': + opt.max_depth() = + parseInt(optarg, 1, "-d/--max_depth must be at least 1", print_help); + break; + case 'h': + opt.min_depth() = + parseInt(optarg, 1, "-h/--min_depth must be at least 1", print_help); + break; + case 'o': + opt.count_out() = optarg; + break; + case 'w': + opt.window_size() = + parseInt(optarg, 10, "-k/--window_size must be at least 10", print_help); + break; + case 'y': + opt.fully_contained() = true; + break; + case 's': + opt.include_secondary() = true; + break; + case 'D': + opt.filter_duplicate() = true; + break; + case 'Q': + opt.filter_QCfailed() = true; + break; + case 'E': + opt.filter_improper_pair() = true; + break; + case 'F': + opt.filter_mate_unmapped() = true; + break; + case 'G': + opt.filter_improper_orientation() = true; + break; + case 'A': + opt.report_all_alleles() = true; + break; + case 'C': + opt.report_count_for_all_positions() = true; + break; + case 'a': + opt.snp_min_allele_freq() = parseFloat( + optarg, 0.0, 1.0, "-a/--snp_min_af must be between 0 and 1", print_help); + break; + case 'i': + opt.ins_min_allele_freq() = parseFloat( + optarg, 0.0, 1.0, "-i/--ins_min_af must be between 0 and 1", print_help); + break; + case 'l': + opt.del_min_allele_freq() = parseFloat( + optarg, 0.0, 1.0, "-l/--del_min_af must be between 0 and 1", print_help); + break; + case 'M': + opt.snp_min_ao() = + parseInt(optarg, 1, "-M/--snp_min_ao must be at least 1", print_help); + break; + case 't': + // opt.num_threads() = parseInt(optarg, 1, "-t/--num_threads must be at least 1", + // print_help); + break; + default: + return 1; + } + } while (next_option != -1); + + return 0; +} + struct Options { Options(const int argc, char* argv[]) { - std::string cmdline; - for(int i=0; ibam_in_.empty()) { @@ -251,150 +287,84 @@ struct Options { } } - decltype(auto) verbosity() { - return (verbosity_); - } + decltype(auto) verbosity() { return (verbosity_); } - decltype(auto) bam_in() { - return (bam_in_); - } + decltype(auto) bam_in() { return (bam_in_); } - decltype(auto) num_threads() { - return (num_threads_); - } + decltype(auto) num_threads() { return (num_threads_); } - //decltype(auto) bam_out() const { - //return bam_out_; + // decltype(auto) bam_out() const { + // return bam_out_; //} - decltype(auto) vcf_out() { - return (vcf_out_); - } - - decltype(auto) count_out() { - return (count_out_); - } - - decltype(auto) target_region_in() { - return (target_region_in_); - } + decltype(auto) vcf_out() { return (vcf_out_); } + + decltype(auto) count_out() { return (count_out_); } - //const bool is_output_bam() const { - //if (bam_out_.empty()) return false; - //else return true; + decltype(auto) target_region_in() { return (target_region_in_); } + + // const bool is_output_bam() const { + // if (bam_out_.empty()) return false; + // else return true; //} - decltype(auto) ref() { - return (ref_); - } + decltype(auto) ref() { return (ref_); } - decltype(auto) min_mapq() const { - return (min_mapq_); - } + decltype(auto) min_mapq() const { return (min_mapq_); } - decltype(auto) snp_min_bq() const { - return (snp_min_bq_); - } + decltype(auto) snp_min_bq() const { return (snp_min_bq_); } - decltype(auto) min_mapq() { - return (min_mapq_); - } + decltype(auto) min_mapq() { return (min_mapq_); } - decltype(auto) snp_min_bq() { - return (snp_min_bq_); - } + decltype(auto) snp_min_bq() { return (snp_min_bq_); } - decltype(auto) calculate_qual_stat() { - return (calculate_qual_stat_); - } + decltype(auto) calculate_qual_stat() { return (calculate_qual_stat_); } - decltype(auto) window_size() { - return (window_size_); - } + decltype(auto) window_size() { return (window_size_); } - decltype(auto) snp_min_allele_freq() { - return (snp_min_allele_freq_); - } + decltype(auto) snp_min_allele_freq() { return (snp_min_allele_freq_); } - decltype(auto) ins_min_allele_freq() { - return (ins_min_allele_freq_); - } + decltype(auto) ins_min_allele_freq() { return (ins_min_allele_freq_); } - decltype(auto) del_min_allele_freq() { - return (del_min_allele_freq_); - } + decltype(auto) del_min_allele_freq() { return (del_min_allele_freq_); } - decltype(auto) snp_min_ao() { - return (snp_min_ao_); - } + decltype(auto) snp_min_ao() { return (snp_min_ao_); } - decltype(auto) fully_contained() { - return (fully_contained_); - } + decltype(auto) fully_contained() { return (fully_contained_); } - decltype(auto) max_depth() { - return (max_depth_); - } + decltype(auto) max_depth() { return (max_depth_); } - decltype(auto) min_depth() { - return (min_depth_); - } + decltype(auto) min_depth() { return (min_depth_); } - decltype(auto) include_secondary() { - return (include_secondary_); - } + decltype(auto) include_secondary() { return (include_secondary_); } - decltype(auto) include_secondary() const { - return (include_secondary_); - } + decltype(auto) include_secondary() const { return (include_secondary_); } - decltype(auto) filter_duplicate() const { - return (filter_duplicate_); - } + decltype(auto) filter_duplicate() const { return (filter_duplicate_); } - decltype(auto) filter_duplicate() { - return (filter_duplicate_); - } + decltype(auto) filter_duplicate() { return (filter_duplicate_); } - decltype(auto) filter_QCfailed() const { - return (filter_QCfailed_); - } + decltype(auto) filter_QCfailed() const { return (filter_QCfailed_); } - decltype(auto) filter_QCfailed() { - return (filter_QCfailed_); - } + decltype(auto) filter_QCfailed() { return (filter_QCfailed_); } - decltype(auto) filter_improper_pair() const { - return (filter_improper_pair_); - } + decltype(auto) filter_improper_pair() const { return (filter_improper_pair_); } - decltype(auto) filter_improper_pair() { - return (filter_improper_pair_); - } + decltype(auto) filter_improper_pair() { return (filter_improper_pair_); } - decltype(auto) filter_mate_unmapped() const { - return (filter_mate_unmapped_); - } + decltype(auto) filter_mate_unmapped() const { return (filter_mate_unmapped_); } - decltype(auto) filter_mate_unmapped() { - return (filter_mate_unmapped_); - } + decltype(auto) filter_mate_unmapped() { return (filter_mate_unmapped_); } decltype(auto) filter_improper_orientation() const { return (filter_improper_orientation_); } - decltype(auto) filter_improper_orientation() { - return (filter_improper_orientation_); - } + decltype(auto) filter_improper_orientation() { return (filter_improper_orientation_); } - decltype(auto) report_all_alleles() const { - return (report_all_alleles_); - } + decltype(auto) report_all_alleles() const { return (report_all_alleles_); } - decltype(auto) report_all_alleles() { - return (report_all_alleles_); - } + decltype(auto) report_all_alleles() { return (report_all_alleles_); } decltype(auto) report_count_for_all_positions() const { return (report_count_for_all_positions_); @@ -433,8 +403,6 @@ struct Options { bool report_all_alleles_ = false; bool report_count_for_all_positions_ = false; }; -}//namespace neusomatic - - +} // namespace neusomatic #endif /* LIB_INCLUDE_OPTIONS_H_ */ diff --git a/neusomatic/include/RefSeqs.h b/neusomatic/include/RefSeqs.h index 446faac..35049b7 100644 --- a/neusomatic/include/RefSeqs.h +++ b/neusomatic/include/RefSeqs.h @@ -1,58 +1,52 @@ #ifndef CPPUTIL_BIO_REF_SEQS_H #define CPPUTIL_BIO_REF_SEQS_H -#include +#include + #include #include -#include +#include -namespace neusomatic { namespace bio { +namespace neusomatic { +namespace bio { class RefSeqs { public: - using Seq = seqan::Dna5String; // can templatize for other strings for the future + using Seq = seqan::Dna5String; // can templatize for other strings for the future explicit RefSeqs(std::string const& file_name, const bool load_seq = false) { load(file_name); } - decltype(auto) operator[](size_t rid) const { - return (rid_seq_[rid]); - } + decltype(auto) operator[](size_t rid) const { return (rid_seq_[rid]); } - decltype(auto) GetName(size_t rid) const { - return (rid_str_[rid]); - } + decltype(auto) GetName(size_t rid) const { return (rid_str_[rid]); } - decltype(auto) GetNames() const { - return (rid_str_); - } + decltype(auto) GetNames() const { return (rid_str_); } - template + template decltype(auto) GetId(Str const& str) const { const auto& itr = str_rid_.find(str); if (itr == str_rid_.end()) { - throw std::runtime_error( str + " not found in reference database"); + throw std::runtime_error(str + " not found in reference database"); } return itr->second; } - auto size() const { - return rid_seq_.size(); - } + auto size() const { return rid_seq_.size(); } - template + template decltype(auto) operator[](Str const& str) const { const auto itr = str_rid_.find(str); if (itr == str_rid_.end()) { - throw std::runtime_error( str + " not found in reference database"); + throw std::runtime_error(str + " not found in reference database"); } - return rid_seq_[ itr->second ]; + return rid_seq_[itr->second]; } void load(std::string const& file_name, const bool load_seq = false) { seqan::FaiIndex fai; if (not open(fai, file_name.c_str())) { - throw std::runtime_error("cannot find index file for "+file_name); + throw std::runtime_error("cannot find index file for " + file_name); }; rid_seq_.resize(numSeqs(fai)); @@ -60,7 +54,7 @@ class RefSeqs { str_rid_.clear(); for (size_t rid = 0; rid < numSeqs(fai); ++rid) { if (load_seq) { - readSequence(rid_seq_[rid], fai, rid); // because this is not thread-safe + readSequence(rid_seq_[rid], fai, rid); // because this is not thread-safe } rid_str_[rid] = seqan::sequenceName(fai, rid); str_rid_[rid_str_[rid]] = rid; @@ -74,8 +68,7 @@ class RefSeqs { std::map str_rid_; }; -} //namespace bio -} //namespace neusomatic - +} // namespace bio +} // namespace neusomatic #endif diff --git a/neusomatic/include/SeqanUtils.h b/neusomatic/include/SeqanUtils.h index 70f7ba7..8d69c7a 100644 --- a/neusomatic/include/SeqanUtils.h +++ b/neusomatic/include/SeqanUtils.h @@ -1,81 +1,85 @@ #ifndef SEQAN_UTILS_HPP #define SEQAN_UTILS_HPP +#include +#include +#include + #include -#include -#include -#include - -namespace neusomatic{ namespace bio{ - -template -std::string Base2String(const Base& base, const std::string& gap_symbol){ - - //std::cout<(base)<<" :static cast: "; - //std::cout<<(unsigned)seqan::ordValue(base)< +std::string Base2String(const Base& base, const std::string& gap_symbol) { + // std::cout<(base)<<" :static cast: "; + // std::cout<<(unsigned)seqan::ordValue(base)< +template <> std::string Base2String(const char& base, const std::string& gap_symbol) { - switch(base) { - case 'a': - case 'A': - case 0: - return "A"; - - case 'c': - case 'C': - case 1: - return "C"; - - case 'g': - case 'G': - case 2: - return "G"; - case 't': - case 'T': - case 3: - return "T"; - default: - return gap_symbol; + switch (base) { + case 'a': + case 'A': + case 0: + return "A"; + + case 'c': + case 'C': + case 1: + return "C"; + + case 'g': + case 'G': + case 2: + return "G"; + case 't': + case 'T': + case 3: + return "T"; + default: + return gap_symbol; } } -template -std::string to_string(Itr b, Itr e, const std::string& gap_symbol = "N"){ +template +std::string to_string(Itr b, Itr e, const std::string& gap_symbol = "N") { std::string result; auto it = b; - for(; it < e; ++it){ + for (; it < e; ++it) { result += Base2String(*it, gap_symbol); } return result; } -inline std::string to_string(const seqan::Dna5String& seq, const std::string& gap_symbol = "N"){ +inline std::string to_string(const seqan::Dna5String& seq, + const std::string& gap_symbol = "N") { std::string result; for (unsigned int i = 0; i < seqan::length(seq); ++i) { - result += Base2String(seq[i], gap_symbol); + result += Base2String(seq[i], gap_symbol); } return result; } -template -inline decltype(auto) ToSeqanIntervals(const std::vector& invs){ - +template +inline decltype(auto) ToSeqanIntervals(const std::vector& invs) { typedef seqan::IntervalAndCargo TIntervalAndCargo; - std::vector seqan_intervals; + std::vector seqan_intervals; for (auto it = invs.cbegin(); it != invs.cend(); ++it) { TIntervalAndCargo i(it->left(), it->right(), *it); seqan_intervals.push_back(i); @@ -83,9 +87,11 @@ inline decltype(auto) ToSeqanIntervals(const std::vector& invs){ return seqan_intervals; } -inline std::vector SeqanSplit(seqan::CharString target, std::string splitter) { +inline std::vector SeqanSplit(seqan::CharString target, + std::string splitter) { /* - Return a vec of size 1 containing the intact target if splitter is not found in the target. + Return a vec of size 1 containing the intact target if splitter is not found in the + target. */ if (seqan::length(target) == 0 || seqan::length(splitter) == 0) { throw std::runtime_error("Invalid string and pattern to split."); @@ -101,9 +107,10 @@ inline std::vector SeqanSplit(seqan::CharString target, std:: } TPos lastp = 0; - for (auto const& p: searches) { + for (auto const& p : searches) { seqan::CharString unit = seqan::infix(target, lastp, p.first); - if (!seqan::empty(unit)) result.push_back(unit); + if (!seqan::empty(unit)) + result.push_back(unit); lastp = p.second; } if (lastp < seqan::length(target) - 1) { @@ -113,7 +120,7 @@ inline std::vector SeqanSplit(seqan::CharString target, std:: return result; } -}// namespace neusomatic -}// namespace bio +} // namespace bio +} // namespace neusomatic #endif /* SEQAN_UTILS_HPP */ diff --git a/neusomatic/include/bam_variant.hpp b/neusomatic/include/bam_variant.hpp index 7ac5057..999d977 100644 --- a/neusomatic/include/bam_variant.hpp +++ b/neusomatic/include/bam_variant.hpp @@ -1,38 +1,38 @@ #ifndef BAM_VARIANT_HPP #define BAM_VARIANT_HPP -#include #include +#include + #include "variant.hpp" -namespace neusomatic{ +namespace neusomatic { -inline bool IsNuc( char c ) -{ - static const char alpha[] = "ACGTRYKMSWBDHVN"; - return ( std::strchr( alpha, c ) != NULL ); +inline bool IsNuc(char c) { + static const char alpha[] = "ACGTRYKMSWBDHVN"; + return (std::strchr(alpha, c) != NULL); } -inline bool IsNumber(const std::string& s) -{ - return !s.empty() && std::find_if(s.begin(), s.end(), [](char c) { return !std::isdigit(c); }) == s.end(); -} +inline bool IsNumber(const std::string& s) { + return !s.empty() && std::find_if(s.begin(), s.end(), + [](char c) { return !std::isdigit(c); }) == s.end(); +} enum class MDType { MATCH = 0, SUB, DEL }; -inline std::ostream& operator<<(std::ostream & os, MDType & mdt) { - switch(mdt) { - case MDType::MATCH: - os << 'M'; - break; - case MDType::SUB: - os << 'X'; - break; - case MDType::DEL: - os << 'D'; - break; - default: - os << "ERROR"; - break; +inline std::ostream& operator<<(std::ostream& os, MDType& mdt) { + switch (mdt) { + case MDType::MATCH: + os << 'M'; + break; + case MDType::SUB: + os << 'X'; + break; + case MDType::DEL: + os << 'D'; + break; + default: + os << "ERROR"; + break; } return os; } @@ -42,44 +42,47 @@ class MDTags { const std::string tag_string_; int16_t curr_pos_; size_t size; + public: - MDTags(const std::string& tags) : tag_string_(tags), curr_pos_(0), size(tags.length()) { - } + MDTags(const std::string& tags) + : tag_string_(tags), curr_pos_(0), size(tags.length()) {} - bool Next(std::pair& result) { - if (curr_pos_ == size) {return false;} - if (std::isdigit(tag_string_[curr_pos_])) { // match + bool Next(std::pair& result) { + if (curr_pos_ == size) { + return false; + } + if (std::isdigit(tag_string_[curr_pos_])) { // match if (tag_string_[curr_pos_] == '0') { - curr_pos_ ++; - } - else { + curr_pos_++; + } else { int len = 0; while (curr_pos_ + len < size && std::isdigit(tag_string_[curr_pos_ + len])) { ++len; - } + } result = std::make_pair(MDType::MATCH, tag_string_.substr(curr_pos_, len)); - curr_pos_ += len ; + curr_pos_ += len; return true; } - } + } - if (tag_string_[curr_pos_] == '^') { // deletion + if (tag_string_[curr_pos_] == '^') { // deletion int len = 0; ++curr_pos_; while (curr_pos_ + len < size && IsNuc(tag_string_[curr_pos_ + len])) { ++len; - } + } result = std::make_pair(MDType::DEL, tag_string_.substr(curr_pos_, len)); curr_pos_ += len; return true; } - if (IsNuc(tag_string_[curr_pos_])) { // substitution + if (IsNuc(tag_string_[curr_pos_])) { // substitution int len = 0; std::string s; - while (curr_pos_ + len < size && (IsNuc(tag_string_[curr_pos_ + len]) || tag_string_[curr_pos_ + len] == '0')) { + while (curr_pos_ + len < size && (IsNuc(tag_string_[curr_pos_ + len]) || + tag_string_[curr_pos_ + len] == '0')) { if (IsNuc(tag_string_[curr_pos_ + len])) { - s += tag_string_.substr(curr_pos_ + len, 1); + s += tag_string_.substr(curr_pos_ + len, 1); } ++len; } @@ -92,58 +95,61 @@ class MDTags { } }; -template -std::vector> GetIndels(const BamRecord& bam) { +template +std::vector> GetIndels( + const BamRecord& bam) { std::vector> result; - if (bam.Position() < 0) return result; - auto const& cigar = bam.GetCigar(); + if (bam.Position() < 0) + return result; + auto const& cigar = bam.GetCigar(); auto const& seq = bam.Sequence(); int32_t ref_pos = bam.Position(); int32_t read_pos = bam.AlignmentPosition(); for (auto c = cigar.begin(); c != cigar.end(); ++c) { std::string s; switch (c->Type()) { - case 'H': - if (c == cigar.begin()){ - read_pos -= c->Length(); - } - break; - case 'M': - ref_pos += c->Length(); - read_pos += c->Length(); - break; - case 'I': - s = bam.Sequence().substr(read_pos, c->Length()); - result.emplace_back(bam.ChrID(), ref_pos, 0, s); - read_pos += c->Length(); - break; - case 'D': - result.emplace_back(bam.ChrID(), ref_pos, c->Length(), s); - ref_pos += c->Length(); - break; - default: - break; + case 'H': + if (c == cigar.begin()) { + read_pos -= c->Length(); + } + break; + case 'M': + ref_pos += c->Length(); + read_pos += c->Length(); + break; + case 'I': + s = bam.Sequence().substr(read_pos, c->Length()); + result.emplace_back(bam.ChrID(), ref_pos, 0, s); + read_pos += c->Length(); + break; + case 'D': + result.emplace_back(bam.ChrID(), ref_pos, c->Length(), s); + ref_pos += c->Length(); + break; + default: + break; } } return result; } -template +template std::vector> GetSNVs(const BamRecord& bam) { std::vector> result; std::string mdstr; bool status = bam.GetZTag("MD", mdstr); if (!status) { - std::cerr<<"Warning: no MD tag!\n"; + std::cerr << "Warning: no MD tag!\n"; return result; } if (IsNumber(mdstr)) { return result; } - std::transform(mdstr.begin(), mdstr.end(),mdstr.begin(), ::toupper); - if (bam.Position() < 0) return result; - auto const& cigar = bam.GetCigar(); + std::transform(mdstr.begin(), mdstr.end(), mdstr.begin(), ::toupper); + if (bam.Position() < 0) + return result; + auto const& cigar = bam.GetCigar(); auto const& seq = bam.Sequence(); int32_t ref_pos = bam.Position(); int32_t read_pos = bam.AlignmentPosition(); @@ -154,39 +160,41 @@ std::vector> GetSNVs(const BamReco std::vector> snvs_MDTags; int sum_M_len_before_D = 0; int match_len = 0; - int ref_consumer = 0; + int ref_consumer = 0; for (auto c = cigar.begin(); c != cigar.end(); ++c) { match_len = 0; - ref_consumer = 0; + ref_consumer = 0; switch (c->Type()) { - case 'M': - sum_M_len_before_D += c->Length(); - break; - case 'D': - while (md.Next(unit)) { - if (unit.first == MDType::MATCH) { - match_len = stoi(unit.second); - ref_pos += match_len; - ref_consumer += match_len; - } - if (unit.first == MDType::SUB) { - snvs_MDTags.emplace_back(bam.ChrID(), ref_pos, unit.second.length(), unit.second); - ref_consumer += unit.second.length(); - ref_pos += unit.second.length(); - } - if (ref_consumer == sum_M_len_before_D) break; + case 'M': + sum_M_len_before_D += c->Length(); + break; + case 'D': + while (md.Next(unit)) { + if (unit.first == MDType::MATCH) { + match_len = stoi(unit.second); + ref_pos += match_len; + ref_consumer += match_len; } - sum_M_len_before_D = 0; - ref_pos += c->Length(); - break; - default: - break; + if (unit.first == MDType::SUB) { + snvs_MDTags.emplace_back(bam.ChrID(), ref_pos, unit.second.length(), + unit.second); + ref_consumer += unit.second.length(); + ref_pos += unit.second.length(); + } + if (ref_consumer == sum_M_len_before_D) + break; + } + sum_M_len_before_D = 0; + ref_pos += c->Length(); + break; + default: + break; } } - ref_consumer = 0; + ref_consumer = 0; while (md.Next(unit)) { if (unit.first == MDType::MATCH) { match_len = stoi(unit.second); @@ -198,19 +206,20 @@ std::vector> GetSNVs(const BamReco ref_consumer += unit.second.length(); ref_pos += unit.second.length(); } - if (ref_consumer == sum_M_len_before_D) break; + if (ref_consumer == sum_M_len_before_D) + break; } ref_pos = bam.Position(); read_pos = bam.AlignmentPosition(); auto b = cigar.begin(); if (b->Type() == 'H') { - if (b == cigar.begin()){ - read_pos -= b->Length(); + if (b == cigar.begin()) { + read_pos -= b->Length(); } } for (const auto& t : snvs_MDTags) { - const int32_t ref_consume_goal = t.left() - ref_pos; + const int32_t ref_consume_goal = t.left() - ref_pos; int32_t ref_consume = 0; int32_t read_consume = 0; while (ref_consume < ref_consume_goal) { @@ -225,25 +234,26 @@ std::vector> GetSNVs(const BamReco ++b; } int32_t read_shift = read_consume - (ref_consume - ref_consume_goal); - if (b != cigar.end() && b->Type() == 'I' && ref_consume == ref_consume_goal) { // SUB after an INS + if (b != cigar.end() && b->Type() == 'I' && + ref_consume == ref_consume_goal) { // SUB after an INS read_shift += b->Length(); } - std::string s = seq.substr(read_pos + read_shift, t.allele().length()); + std::string s = seq.substr(read_pos + read_shift, t.allele().length()); result.emplace_back(bam.ChrID(), t.left(), t.length(), s); - read_pos += read_consume; + read_pos += read_consume; ref_pos += ref_consume; } return result; } -template +template std::vector> GetVars(const BamRecord& bam) { auto results = GetIndels(bam); auto snvs = GetSNVs(bam); results.insert(results.end(), snvs.begin(), snvs.end()); - //std::sort(results.begin(), results.end()); + // std::sort(results.begin(), results.end()); return results; } -}/* ALLAMERICAN_VARIANT_HPP */ -#endif +} // namespace neusomatic +#endif diff --git a/neusomatic/include/bedio.hpp b/neusomatic/include/bedio.hpp index b0bdd2d..8cf6f4c 100644 --- a/neusomatic/include/bedio.hpp +++ b/neusomatic/include/bedio.hpp @@ -1,26 +1,22 @@ #ifndef LIB_INCLUDE_BEDIO_HPP #define LIB_INCLUDE_BEDIO_HPP +#include -#include -#include -#include -#include "Interval.hpp" - +#include +#include -namespace neusomatic{ +#include "Interval.hpp" +namespace neusomatic { -template +template class BedIO { public: + BedIO(const std::string& filepath) : filepath_(filepath) {} - BedIO(const std::string& filepath): filepath_(filepath) { - - } - - static void WriteToBed3(const std::vector& invs, seqan::BedFileOut& out){ - for(const auto& inv: invs) { + static void WriteToBed3(const std::vector& invs, seqan::BedFileOut& out) { + for (const auto& inv : invs) { seqan::BedRecord record; record.beginPos = inv.left(); record.endPos = inv.right(); @@ -30,55 +26,56 @@ class BedIO { } std::vector ReadBed3() { - seqan::BedFileIn bed_in_(filepath_.c_str()); + seqan::BedFileIn bed_in_(filepath_.c_str()); if (seqan::atEnd(bed_in_)) { - throw std::runtime_error("It seems to be a corrupted or wrong bed file: " + filepath_); + throw std::runtime_error("It seems to be a corrupted or wrong bed file: " + + filepath_); } std::vector result; - while (!seqan::atEnd(bed_in_)){ + while (!seqan::atEnd(bed_in_)) { seqan::readRecord(record_, bed_in_); - //always left <= right + // always left <= right int l = std::min(record_.beginPos, record_.endPos); int r = std::max(record_.endPos, record_.endPos); auto refcontig = seqan::toCString(record_.ref); - Interval inv(refcontig, l ,r); + Interval inv(refcontig, l, r); result.push_back(inv); } return result; } - std::vector ReadBed3_windowed(int window_size) { seqan::BedFileIn bed_in_(filepath_.c_str()); if (seqan::atEnd(bed_in_)) { - throw std::runtime_error("It seems to be a corrupted or wrong bed file: " + filepath_); + throw std::runtime_error("It seems to be a corrupted or wrong bed file: " + + filepath_); } std::vector result; - while (!seqan::atEnd(bed_in_)){ + while (!seqan::atEnd(bed_in_)) { seqan::readRecord(record_, bed_in_); - //always left <= right + // always left <= right int l = std::min(record_.beginPos, record_.endPos); - int r = std::max(record_.endPos, record_.endPos)+1; - int span = r-l; + int r = std::max(record_.endPos, record_.endPos) + 1; + int span = r - l; auto refcontig = seqan::toCString(record_.ref); - for (int i=0; i<=span/window_size; i++){ - int l_2=std::max(l+i*window_size-1,l); - int r_2=std::min(r,l+(i+1)*window_size); - if ((r-r_2) ReadAndReduceBedInvs(){ + std::vector ReadAndReduceBedInvs() { std::vector result; auto splited_invs = ReadAndSplitBedInvs(); for (auto it = splited_invs.cbegin(); it != splited_invs.cend(); ++it) { @@ -93,7 +90,7 @@ class BedIO { return result; } - std::vector ReadAndDisjointBedInvs(){ + std::vector ReadAndDisjointBedInvs() { std::vector result; auto splited_invs = ReadAndSplitBedInvs(); for (auto it = splited_invs.cbegin(); it != splited_invs.cend(); ++it) { @@ -115,10 +112,10 @@ class BedIO { private: const std::string& filepath_; - //seqan::BedFileIn bed_in_; - seqan::BedRecord record_; //place holder + // seqan::BedFileIn bed_in_; + seqan::BedRecord record_; // place holder }; -}//namespace neusomatic +} // namespace neusomatic #endif /* LIB_INCLUDE_BEDIO_HPP */ diff --git a/neusomatic/include/change_coordinates.hpp b/neusomatic/include/change_coordinates.hpp index a126c1c..8f6e0b7 100644 --- a/neusomatic/include/change_coordinates.hpp +++ b/neusomatic/include/change_coordinates.hpp @@ -7,9 +7,10 @@ namespace neusomatic { class ChangeCoordinates { std::vector ungap_pos_; - std::vector gap_pos_; // reduce to the left-most position + std::vector gap_pos_; // reduce to the left-most position std::string ref_seq_; static const unsigned char gap_chr_ = '-'; + public: ChangeCoordinates(const std::string& refgap) { int i = 0; @@ -20,31 +21,23 @@ class ChangeCoordinates { for (auto it = begin(refgap); it != end(refgap); ++it, ++i) { if (*it == gap_chr_) { - ungap_pos_[i] = ungap_cursor; + ungap_pos_[i] = ungap_cursor; } else { gap_pos_.push_back(i); - ungap_pos_[i] = ungap_cursor; + ungap_pos_[i] = ungap_cursor; ref_seq_ += neusomatic::bio::Base2String(*it, "N"); ungap_cursor++; } } } - int UngapPos(const int gap_pos) const { - return ungap_pos_[gap_pos]; - } + int UngapPos(const int gap_pos) const { return ungap_pos_[gap_pos]; } - int GapPos(const int ref_pos) const { - return gap_pos_[ref_pos]; - } + int GapPos(const int ref_pos) const { return gap_pos_[ref_pos]; } - const std::string& RefSeq() const { - return ref_seq_; - } + const std::string& RefSeq() const { return ref_seq_; } }; -}// namespace neusomatic - - +} // namespace neusomatic #endif /* ALlAMERICAN_CHANGE_COORDINATES_HPP */ diff --git a/neusomatic/include/msa.hpp b/neusomatic/include/msa.hpp index e0086cb..48a39c6 100644 --- a/neusomatic/include/msa.hpp +++ b/neusomatic/include/msa.hpp @@ -1,18 +1,17 @@ #ifndef MSA_HPP #define MSA_HPP -#include -#include #include +#include +#include #include "bam_variant.hpp" -namespace neusomatic{ - +namespace neusomatic { -template +template class GappedSeq { /* - * This data struct reprents an list of alternative Base + * This data struct reprents an list of alternative Base * and Gap. The Base always starts first. */ std::vector bases_; @@ -21,31 +20,31 @@ class GappedSeq { std::string seq_str_; public: - GappedSeq(unsigned rlen): bases_(rlen), gaps_(rlen) {} - - GappedSeq(const std::string& rseq, const GInv& ginv): bases_(rseq.begin(), rseq.end()), gaps_(rseq.length()), ginv_(ginv) { + GappedSeq(unsigned rlen) : bases_(rlen), gaps_(rlen) {} + + GappedSeq(const std::string& rseq, const GInv& ginv) + : bases_(rseq.begin(), rseq.end()), gaps_(rseq.length()), ginv_(ginv) { /* - * Create a GappedSeq from a ref string (no gaps) and its corresponding genomic interval + * Create a GappedSeq from a ref string (no gaps) and its corresponding genomic + * interval * */ - if (rseq.size() + ginv.left() != ginv.right()) { + if (rseq.size() + ginv.left() != ginv.right()) { throw std::runtime_error("seq length does not match the interval length"); } } void SetGap(const size_t i, const int32_t len) { if (len > gaps_[i]) { - gaps_[i] = len; + gaps_[i] = len; } } - decltype(auto) gaps() { - return (gaps_); - } + decltype(auto) gaps() { return (gaps_); } - decltype(auto) ginv() const {return (ginv_);} - int32_t left() const {return ginv_.left();} - int32_t right() const {return ginv_.right();} - std::vector bases() const {return bases_;} - std::vector gaps() const {return gaps_;} + decltype(auto) ginv() const { return (ginv_); } + int32_t left() const { return ginv_.left(); } + int32_t right() const { return ginv_.right(); } + std::vector bases() const { return bases_; } + std::vector gaps() const { return gaps_; } std::string to_string() const { std::string result; @@ -64,7 +63,7 @@ class GappedSeq { int res = 0; for (size_t i = 0; i < bases_.size(); ++i) { if (i != 0) { - res += gaps_[i]; + res += gaps_[i]; } } res += bases_.size(); @@ -72,62 +71,62 @@ class GappedSeq { } }; -template +template inline std::ostream& operator<<(std::ostream& os, const GappedSeq gs) { - os << gs.ginv() <<":\t"; + os << gs.ginv() << ":\t"; for (size_t i = 0; i < gs.bases().size(); ++i) { if (i != 0) { os << std::string(gs.gaps()[i], '-'); } - os << gs.bases()[i]; + os << gs.bases()[i]; } return os; } -template +template class ReadSeqGap { /* - * Represent a gapped read in the msa. - * Alternate Base with gaps. The Base always comes first. + * Represent a gapped read in the msa. + * Alternate Base with gaps. The Base always comes first. */ - std::vector bases_; - std::vector gapstrs_; // the first element in this vector is not in use. + std::vector bases_; + std::vector gapstrs_; // the first element in this vector is not in use. std::string bases_str_; static const unsigned char missing_chr_ = '~'; static const unsigned char gapchar_ = '-'; -public: - ReadSeqGap(const GappedSeq& refgap, - const std::vector& vars, - const int record_begin, +public: + ReadSeqGap(const GappedSeq& refgap, + const std::vector& vars, const int record_begin, const int record_end) { - // starts from the reference bases and then fill in mutated bases and gaps. - - int msa_len=0; - for (size_t i = 0; i < refgap.gaps().size(); ++ i) { + + int msa_len = 0; + for (size_t i = 0; i < refgap.gaps().size(); ++i) { const auto& gap = refgap.gaps()[i]; gapstrs_[i] = std::string(gap, gapchar_); - msa_len += 1+gap; + msa_len += 1 + gap; } for (auto const& v : vars) { - if (v.Type() == neusomatic::bio::VarType::SUB) { - if (!neusomatic::bio::IsOverlapped(v.ginv(), refgap.ginv())) continue; + if (!neusomatic::bio::IsOverlapped(v.ginv(), refgap.ginv())) + continue; int left = std::max(v.left(), refgap.left()); int right = std::min(v.right(), refgap.right()); for (size_t i = left; i < right; ++i) { bases_[i - refgap.left()] = v.allele()[i - v.left()]; } } else if (v.Type() == neusomatic::bio::VarType::DEL) { - if (!neusomatic::bio::IsOverlapped(v.ginv(), refgap.ginv())) continue; + if (!neusomatic::bio::IsOverlapped(v.ginv(), refgap.ginv())) + continue; int left = std::max(v.left(), refgap.left()); int right = std::min(v.right(), refgap.right()); for (size_t i = left; i < right; ++i) { bases_[i - refgap.left()] = gapchar_; } } else if (v.Type() == neusomatic::bio::VarType::INS) { - if (!neusomatic::bio::IsContainedIn(v.ginv(), refgap.ginv())) continue; + if (!neusomatic::bio::IsContainedIn(v.ginv(), refgap.ginv())) + continue; for (size_t i = 0; i < v.allele().length(); ++i) { gapstrs_[v.left() - refgap.left()][i] = v.allele()[i]; } @@ -146,26 +145,30 @@ class ReadSeqGap { std::string result; for (size_t i = 0; i < bases_.size(); ++i) { if (i != 0) { - result += bases_[i] == missing_chr_ ? std::string(gapstrs_[i].size(), missing_chr_) : gapstrs_[i]; - } + result += bases_[i] == missing_chr_ + ? std::string(gapstrs_[i].size(), missing_chr_) + : gapstrs_[i]; + } result += bases_[i]; } if (result[0] == missing_chr_) { int i = 0; - while (result[++i] == missing_chr_); - for (; result[i] == gapchar_; ++i) result[i] = missing_chr_; + while (result[++i] == missing_chr_) { + } + for (; result[i] == gapchar_; ++i) { + result[i] = missing_chr_; + } } return result; } - }; -template +template class MSABuilder { const std::vector& bam_records_; - //std::vector rids_; - GappedSeq ref_gaps_; - std::vector gapstrs_; // the first element in this vector is not in use. + // std::vector rids_; + GappedSeq ref_gaps_; + std::vector gapstrs_; // the first element in this vector is not in use. int ncol_; std::string gapped_ref_str_; std::vector gap_positions_; @@ -173,35 +176,24 @@ class MSABuilder { static const unsigned char gapchar_ = '-'; public: - MSABuilder() = delete; - using ContigGaps = GappedSeq; + MSABuilder() = delete; + using ContigGaps = GappedSeq; - decltype(auto) bam_records() const { - return (bam_records_); - } + decltype(auto) bam_records() const { return (bam_records_); } - decltype(auto) ref_gaps() const { - return (ref_gaps_); - } + decltype(auto) ref_gaps() const { return (ref_gaps_); } - size_t size() const { - return bam_records_.size(); - } + size_t size() const { return bam_records_.size(); } - const int ncol() const { - return ncol_; - } + const int ncol() const { return ncol_; } - const std::string gapped_ref_str() const { - return gapped_ref_str_; - } + const std::string gapped_ref_str() const { return gapped_ref_str_; } - const int GapPosition(const int p) const { - return gap_positions_[p]; - } + const int GapPosition(const int p) const { return gap_positions_[p]; } - static void BuildRefGap(const std::vector& bams, GappedSeq& refgaps) { - // This loop setup the reference in gapped space. + static void BuildRefGap(const std::vector& bams, + GappedSeq& refgaps) { + // This loop setup the reference in gapped space. for (auto const& r : bams) { auto vars = neusomatic::GetIndels(r); for (auto const& v : vars) { @@ -215,23 +207,22 @@ class MSABuilder { } } - auto GetGappedSeqAndQual(const BamRecord& r) const { - std::string msa_bases(ncol(), missing_chr_); std::string msa_bquals(ncol(), 33); - auto const& cigar = r.GetCigar(); + auto const& cigar = r.GetCigar(); auto const& seq = r.Sequence(); auto const& qual = r.Qualities(); int32_t ref_pos = r.Position(); int32_t read_pos = r.AlignmentPosition(); - auto gapl = GapPosition(std::max(r.Position() , ref_gaps_.left()) - ref_gaps_.left()); - auto gapr = GapPosition(std::min(r.PositionEnd(), ref_gaps_.right()) - 1 - ref_gaps_.left()); + auto gapl = GapPosition(std::max(r.Position(), ref_gaps_.left()) - ref_gaps_.left()); + auto gapr = + GapPosition(std::min(r.PositionEnd(), ref_gaps_.right()) - 1 - ref_gaps_.left()); if (gapl > gapr) { throw std::runtime_error("Invalid read " + r.Qname() + "\n"); } - std::fill(msa_bases.begin() + gapl, msa_bases.begin() + gapr ,'-'); + std::fill(msa_bases.begin() + gapl, msa_bases.begin() + gapr, '-'); if (cigar.begin()->Type() == 'H') { read_pos -= cigar.begin()->Length(); @@ -241,17 +232,18 @@ class MSABuilder { const int l = std::max(ref_pos, ref_gaps_.left()); const int r = std::min(int32_t(ref_pos + c->Length()), ref_gaps_.right()); if (l < r) { - const int s = read_pos + l - ref_pos; + const int s = read_pos + l - ref_pos; auto ss = seq.substr(s, l - r); - auto qq = qual.substr(s, l -r); - + auto qq = qual.substr(s, l - r); + for (int pp = l; pp < r; ++pp) { int gapp = GapPosition(pp - ref_gaps_.left()); - msa_bases[gapp] = ss[pp - l]; + msa_bases[gapp] = ss[pp - l]; msa_bquals[gapp] = qq[pp - l]; if (pp != r - 1) { - for (auto gg = GapPosition(pp - ref_gaps_.left()) + 1 ; gg < GapPosition(pp - ref_gaps_.left() + 1); gg++){ - //msa_bases[gg] = gapchar_; + for (auto gg = GapPosition(pp - ref_gaps_.left()) + 1; + gg < GapPosition(pp - ref_gaps_.left() + 1); gg++) { + // msa_bases[gg] = gapchar_; msa_bquals[gg] = qq[pp - l]; } } @@ -266,13 +258,13 @@ class MSABuilder { const auto gapp = GapPosition(ref_pos - 1 - ref_gaps_.left()); const auto gapp_end = GapPosition(ref_pos - ref_gaps_.left()); auto s = seq.substr(read_pos, c->Length()); - auto q = qual.substr(read_pos, c->Length()); + auto q = qual.substr(read_pos, c->Length()); int pp = 0; for (; pp < s.size(); ++pp) { msa_bases[gapp + pp + 1] = s[pp]; msa_bquals[gapp + pp + 1] = q[pp]; } - for (; gapp + pp + 1 < gapp_end ; ++pp) { + for (; gapp + pp + 1 < gapp_end; ++pp) { msa_bases[gapp + pp + 1] = gapchar_; msa_bquals[gapp + pp + 1] = q.back(); } @@ -287,15 +279,17 @@ class MSABuilder { const unsigned char q = qual[read_pos - 1]; for (int pp = l; pp < r; ++pp) { int gapp = GapPosition(pp - ref_gaps_.left()); - msa_bases[gapp] = gapchar_; + msa_bases[gapp] = gapchar_; msa_bquals[gapp] = q; - for (auto gg = GapPosition(pp - ref_gaps_.left() - 1) + 1 ; gg < GapPosition(pp - ref_gaps_.left()); gg++){ - //msa_bases[gg] = gapchar_; + for (auto gg = GapPosition(pp - ref_gaps_.left() - 1) + 1; + gg < GapPosition(pp - ref_gaps_.left()); gg++) { + // msa_bases[gg] = gapchar_; msa_bquals[gg] = q; } } if (r < ref_gaps_.right()) { - for (auto gg = GapPosition(r - ref_gaps_.left() - 1) + 1 ; gg < GapPosition(r - ref_gaps_.left()); gg++){ + for (auto gg = GapPosition(r - ref_gaps_.left() - 1) + 1; + gg < GapPosition(r - ref_gaps_.left()); gg++) { msa_bquals[gg] = q; } } @@ -303,118 +297,120 @@ class MSABuilder { ref_pos += c->Length(); } } - + return std::make_pair(msa_bases, msa_bquals); } - - MSABuilder(const GInv& ginv, const std::vector& bams, const std::string& refstr): - bam_records_(bams), ref_gaps_(refstr, ginv) { - BuildRefGap(bams, ref_gaps_); + + MSABuilder(const GInv& ginv, const std::vector& bams, + const std::string& refstr) + : bam_records_(bams), ref_gaps_(refstr, ginv) { + BuildRefGap(bams, ref_gaps_); ncol_ = ref_gaps_.length(); gapped_ref_str_ = ref_gaps_.to_string(); gapstrs_.resize(ref_gaps_.bases().size()); - for (size_t i = 0; i < ref_gaps_.gaps().size(); ++ i) { + for (size_t i = 0; i < ref_gaps_.gaps().size(); ++i) { const auto& gap = ref_gaps_.gaps()[i]; gapstrs_[i] = std::string(gap, gapchar_); } gap_positions_.resize(ref_gaps_.bases().size()); - int c=0; + int c = 0; for (size_t i = 0; i < ref_gaps_.bases().size(); ++i) { - if (i !=0) { + if (i != 0) { c += ref_gaps_.gaps()[i]; - } + } gap_positions_[i] = c; c += 1; } } - std::vector GetMSA() const { + std::vector GetMSA() const { std::vector result(bam_records_.size()); for (size_t i = 0; i < bam_records_.size(); ++i) { auto const& r = bam_records_[i]; auto vars = neusomatic::GetVars(r); - ReadSeqGap> row(ref_gaps_, vars, r.Position(), r.PositionEnd()); + ReadSeqGap> row( + ref_gaps_, vars, r.Position(), r.PositionEnd()); result[i] = row.to_string(); } return result; - } + } auto GetClipping(const BamRecord& r) const { int lsc = -1, rsc = -1; - auto const& cigar = r.GetCigar(); - auto front_cigar=cigar.front().Type(); - auto end_cigar=cigar.back().Type(); - int pos_lsc = ((front_cigar=='H')||(front_cigar=='S')) ? r.Position() : -1; - int pos_rsc = ((end_cigar=='H')||(end_cigar=='S')) ? r.PositionEnd()-1 : -1; + auto const& cigar = r.GetCigar(); + auto front_cigar = cigar.front().Type(); + auto end_cigar = cigar.back().Type(); + int pos_lsc = ((front_cigar == 'H') || (front_cigar == 'S')) ? r.Position() : -1; + int pos_rsc = ((end_cigar == 'H') || (end_cigar == 'S')) ? r.PositionEnd() - 1 : -1; - if (ref_gaps_.left() <= pos_lsc && pos_lsc tags(tag_size); - const auto length= r.Sequence().size(); - int32_t nm=0; + const auto length = r.Sequence().size(); + int32_t nm = 0; uint8_t* p1 = bam_aux_get(r.shared_pointer().get(), "NM"); - if (p1){ + if (p1) { nm = bam_aux2i(p1); } - int32_t as=length; + int32_t as = length; uint8_t* p2 = bam_aux_get(r.shared_pointer().get(), "AS"); - if (p2){ + if (p2) { as = bam_aux2i(p2); } - int32_t xs=0; + int32_t xs = 0; uint8_t* p3 = bam_aux_get(r.shared_pointer().get(), "XS"); - if (p3){ + if (p3) { xs = bam_aux2i(p3); } - tags[0]=std::max(std::min(100 - (int) std::rint(((float) nm)/(0.001+length)*100),100),0); - tags[1]=std::max(std::min((int) std::rint(((float) as)/(0.001+length)*100),100),0); - tags[2]=std::max(std::min(100 - (int) std::rint(((float) xs)/(0.001+length)*100),100),0); - tags[3]=r.ProperPair() ? 100 : 0; //bamrecord is mapped in proper pair - tags[4]=(r.NumClip()>0) ? 100: 0; //bamrecord has soft/hard clips + tags[0] = std::max( + std::min(100 - (int)std::rint(((float)nm) / (0.001 + length) * 100), 100), 0); + tags[1] = + std::max(std::min((int)std::rint(((float)as) / (0.001 + length) * 100), 100), 0); + tags[2] = std::max( + std::min(100 - (int)std::rint(((float)xs) / (0.001 + length) * 100), 100), 0); + tags[3] = r.ProperPair() ? 100 : 0; // bamrecord is mapped in proper pair + tags[4] = (r.NumClip() > 0) ? 100 : 0; // bamrecord has soft/hard clips return tags; } - auto GetMSAwithQual() const { - - + auto GetMSAwithQual() const { std::vector result(bam_records_.size()); std::vector bquals(bam_records_.size()); std::vector lscs(bam_records_.size(), -1); std::vector rscs(bam_records_.size(), -1); std::vector mquals(bam_records_.size()); std::vector strands(bam_records_.size()); - std::vector> tags(bam_records_.size(),std::vector(5,0)); + std::vector> tags(bam_records_.size(), std::vector(5, 0)); for (size_t i = 0; i < bam_records_.size(); ++i) { auto const& r = bam_records_[i]; - auto const& cigar = r.GetCigar(); + auto const& cigar = r.GetCigar(); const auto seq_qual = GetGappedSeqAndQual(r); result[i] = seq_qual.first; bquals[i] = seq_qual.second; - + const auto clips = GetClipping(r); - lscs[i] = clips.first; + lscs[i] = clips.first; rscs[i] = clips.second; mquals[i] = r.MapQuality(); - strands[i] = (int) !r.ReverseFlag(); + strands[i] = (int)!r.ReverseFlag(); const auto tag = GetTags(r, 5); tags[i] = tag; } - return std::tuple< std::vector, std::vector, std::vector, - std::vector, std::vector, std::vector, - std::vector> > (result,bquals,mquals,strands,lscs,rscs,tags); - } - - + return std::tuple, std::vector, + std::vector, std::vector, std::vector, + std::vector, std::vector>>( + result, bquals, mquals, strands, lscs, rscs, tags); + } }; -} // namespace neusomatic +} // namespace neusomatic #endif /* ALLAMERICAN_HPP */ diff --git a/neusomatic/include/msa_options.h b/neusomatic/include/msa_options.h index ccbced3..b5ab7b6 100644 --- a/neusomatic/include/msa_options.h +++ b/neusomatic/include/msa_options.h @@ -8,133 +8,141 @@ #ifndef SEQAN_MSA_OPTIONS_H_ #define SEQAN_MSA_OPTIONS_H_ +#include + #include #include #include -#include namespace neusomatic { - static struct option long_options[] = { - {"input", required_argument, 0, 'i'}, - {"output", required_argument, 0, 'o'}, - {"match_score", required_argument, 0, 'A'}, - {"mismatch_penalty", required_argument, 0, 'B'}, - {"gap_open_penalty", required_argument, 0, 'O'}, - {"gap_ext_penalty", required_argument, 0, 'E'}, - {0, 0, 0, 0} // terminator - }; - - const char *short_options = "i:o:A:B:O:E:"; - - void print_help() - { - std::cerr<< "---------------------------------------------------\n"; - std::cerr<< "Usage: msa [options] -i INPUT_FASTA -o OUTPUT_FASTA\n"; - std::cerr<< "General Options:\n"; - - std::cerr<< "Required Options:\n"; - std::cerr<< "-i/--input, input fasta file path, required.\n"; - std::cerr<< "-o/--output, output fasta file path, required.\n"; - std::cerr<< "\n"; - std::cerr<< "Other Options:\n"; - std::cerr<< "-A/--match_score match score, default is 10\n"; - std::cerr<< "-B/--mismatch_penalty penalty for having a mismatch, default is 8\n"; - std::cerr<< "-O/--gap_open_penalty penalty for opening a gap, default is 8\n"; - std::cerr<< "-E/--gap_ext_penalty penalty for extending a gap, default is 6\n"; - } - - int parseInt(const char* optarg, int lower, const char *errmsg, void (*print_help)()) { - long l; - char *endPtr= NULL; - l = strtol(optarg, &endPtr, 10); - if (endPtr != NULL) { - if (l < lower ) { - std::cerr << errmsg << std::endl; - print_help(); - exit(1); - } - return (int32_t)l; - } - std::cerr<< errmsg < upper) - { - std::cerr << errmsg << std::endl; - print_help(); - exit(1); - } +float parseFloat(const char* optarg, float lower, float upper, const char* errmsg, + void (*print_help)()) { + float l; + l = (float)atof(optarg); - return l; + if (l < lower) { + std::cerr << errmsg << std::endl; + print_help(); + exit(1); + } + if (l > upper) { std::cerr << errmsg << std::endl; print_help(); exit(1); - return -1; + } + + return l; + + std::cerr << errmsg << std::endl; + print_help(); + exit(1); + return -1; } - template - int parse_options(int argc, char* argv[], Options& opt) { - int option_index = 0; - int next_option; - do { - next_option = getopt_long(argc, argv, short_options, long_options, &option_index); - switch(next_option) { - case -1: - break; - case 'i': - opt.input() = optarg; - break; - case 'o': - opt.output() = optarg; - break; - case 'A': - opt.match_score() = parseInt(optarg, 0, "match score must >= 0", print_help); - break; - case 'B': - opt.mismatch_score() = parseInt(optarg, 0, "mismatch penalty must >= 0", print_help); - break; - case 'O': - opt.gap_open_score() = parseInt(optarg, 0, "gap open penalty must >= 0", print_help); - break; - case 'E': - opt.gap_extension_score() = parseInt(optarg, 0, "gap extension penalty must >= 0", print_help); - break; - default: - print_help(); - return 1; - } - } while (next_option != -1); - return 0; - } +template +int parse_options(int argc, char* argv[], Options& opt) { + int option_index = 0; + int next_option; + do { + next_option = getopt_long(argc, argv, short_options, long_options, &option_index); + switch (next_option) { + case -1: + break; + case 'i': + opt.input() = optarg; + break; + case 'o': + opt.output() = optarg; + break; + case 'A': + opt.match_score() = parseInt(optarg, 0, "match score must >= 0", print_help); + break; + case 'B': + opt.mismatch_score() = + parseInt(optarg, 0, "mismatch penalty must >= 0", print_help); + break; + case 'O': + opt.gap_open_score() = + parseInt(optarg, 0, "gap open penalty must >= 0", print_help); + break; + case 'E': + opt.gap_extension_score() = + parseInt(optarg, 0, "gap extension penalty must >= 0", print_help); + break; + default: + print_help(); + return 1; + } + } while (next_option != -1); + return 0; +} struct Options { Options(const int argc, char* argv[]) { - std::string cmdline; - for(int i=0; iinput_.empty()) { @@ -148,61 +156,27 @@ struct Options { } } + decltype(auto) input() { return (input_); } + decltype(auto) output() { return (output_); } + decltype(auto) match_score() { return (match_score_); } + decltype(auto) mismatch_score() { return (mismatch_penalty_); } + decltype(auto) gap_open_score() { return (gap_open_penalty_); } + decltype(auto) gap_extension_score() { return (gap_ext_penalty_); } + decltype(auto) input() const { return (input_); } + decltype(auto) output() const { return (output_); } + decltype(auto) match_score() const { return (match_score_); } + decltype(auto) mismatch_score() const { return (mismatch_penalty_); } + decltype(auto) gap_open_score() const { return (gap_open_penalty_); } + decltype(auto) gap_extension_score() const { return (gap_ext_penalty_); } - decltype(auto) input() { - return (input_); - } - decltype(auto) output() { - return (output_); - } - decltype(auto) match_score() { - return (match_score_); - } - - decltype(auto) mismatch_score() { - return (mismatch_penalty_); - } - - decltype(auto) gap_open_score() { - return (gap_open_penalty_); - } - - decltype(auto) gap_extension_score() { - return (gap_ext_penalty_); - } - - decltype(auto) input() const{ - return (input_); - } - decltype(auto) output() const{ - return (output_); - } - decltype(auto) match_score() const{ - return (match_score_); - } - - decltype(auto) mismatch_score() const{ - return (mismatch_penalty_); - } - - decltype(auto) gap_open_score() const{ - return (gap_open_penalty_); - } - - decltype(auto) gap_extension_score() const{ - return (gap_ext_penalty_); - } private: std::string input_; std::string output_; int match_score_ = 10; int mismatch_penalty_ = 8; - int gap_open_penalty_ = 8; + int gap_open_penalty_ = 8; int gap_ext_penalty_ = 6; - }; -}//namespace neusomatic - - +} // namespace neusomatic #endif /* LIB_INCLUDE_OPTIONS_H_ */ diff --git a/neusomatic/include/msa_utils.hpp b/neusomatic/include/msa_utils.hpp index d96960a..64a541c 100644 --- a/neusomatic/include/msa_utils.hpp +++ b/neusomatic/include/msa_utils.hpp @@ -1,65 +1,67 @@ -#ifndef NEU_SOMATIC_MSA_UTIL -#define NEU_SOMATIC_MSA_UTIL +#ifndef NEU_SOMATIC_MSA_UTIL +#define NEU_SOMATIC_MSA_UTIL -#include #include +#include #include "change_coordinates.hpp" #include "msa.hpp" -namespace neusomatic{ - -class Col{ - private: - using Idx = unsigned; - using Base = int; - std::vector bases_; - std::vector bquals_; - - public: - static const int ALPHABET_SIZE = 6; // a, t, c, g, gap and missing_char - static const int TAG_SIZE = 5; - explicit Col(size_t nbases): bases_(nbases), bquals_(nbases), base_freq_(ALPHABET_SIZE), //base_rids_(ALPHABET_SIZE), - bqual_mean(ALPHABET_SIZE), mqual_mean(ALPHABET_SIZE), strand_mean(ALPHABET_SIZE), lsc_mean(ALPHABET_SIZE), - rsc_mean(ALPHABET_SIZE), tag_mean(ALPHABET_SIZE, std::vector(TAG_SIZE)) {} - - Col(): base_freq_(ALPHABET_SIZE), - bqual_mean(ALPHABET_SIZE), mqual_mean(ALPHABET_SIZE), strand_mean(ALPHABET_SIZE), lsc_mean(ALPHABET_SIZE), - rsc_mean(ALPHABET_SIZE), tag_mean(ALPHABET_SIZE, std::vector(TAG_SIZE)) {} - - std::vector base_freq_; - std::vector bqual_mean; - std::vector mqual_mean; - std::vector strand_mean; - std::vector lsc_mean; - std::vector rsc_mean; - std::vector> tag_mean; - decltype(auto) bases() const {return (bases_);} - decltype(auto) bquals() const {return (bquals_);} - - void emplace(const Idx& id, const Base& b) { - bases_[id] = b; - } - - void emplace_qual(const Idx& id, const int& b) { - bquals_[id] = b; - } - - decltype(auto) at(Idx i) const { - return bases_.at(i); - } +namespace neusomatic { - decltype(auto) size() const { - return bases_.size(); - } +class Col { +private: + using Idx = unsigned; + using Base = int; + std::vector bases_; + std::vector bquals_; - decltype(auto) operator[](Idx i) { - return (bases_[i]); - } +public: + static const int ALPHABET_SIZE = 6; // a, t, c, g, gap and missing_char + static const int TAG_SIZE = 5; + explicit Col(size_t nbases) + : bases_(nbases), + bquals_(nbases), + base_freq_(ALPHABET_SIZE), // base_rids_(ALPHABET_SIZE), + bqual_mean(ALPHABET_SIZE), + mqual_mean(ALPHABET_SIZE), + strand_mean(ALPHABET_SIZE), + lsc_mean(ALPHABET_SIZE), + rsc_mean(ALPHABET_SIZE), + tag_mean(ALPHABET_SIZE, std::vector(TAG_SIZE)) {} + + Col() + : base_freq_(ALPHABET_SIZE), + bqual_mean(ALPHABET_SIZE), + mqual_mean(ALPHABET_SIZE), + strand_mean(ALPHABET_SIZE), + lsc_mean(ALPHABET_SIZE), + rsc_mean(ALPHABET_SIZE), + tag_mean(ALPHABET_SIZE, std::vector(TAG_SIZE)) {} + + std::vector base_freq_; + std::vector bqual_mean; + std::vector mqual_mean; + std::vector strand_mean; + std::vector lsc_mean; + std::vector rsc_mean; + std::vector> tag_mean; + decltype(auto) bases() const { return (bases_); } + decltype(auto) bquals() const { return (bquals_); } + + void emplace(const Idx& id, const Base& b) { bases_[id] = b; } + + void emplace_qual(const Idx& id, const int& b) { bquals_[id] = b; } + + decltype(auto) at(Idx i) const { return bases_.at(i); } + + decltype(auto) size() const { return bases_.size(); } + + decltype(auto) operator[](Idx i) { return (bases_[i]); } }; -template -class CondensedArray{ +template +class CondensedArray { public: using Idx = unsigned; static const unsigned char missing_chr_ = '~'; @@ -68,103 +70,75 @@ class CondensedArray{ static const int MISSING_CODE = 5; static int DnaCharToDnaCode(const char& dna) { - switch(dna) { - case 'A': - case 'a': - return 0; - case 'C': - case 'c': - return 1; - case 'G': - case 'g': - return 2; - case 'T': - case 't': - return 3; - case '-': - //case 'N': // do not want N to count - return 4; - default: - return 5; + switch (dna) { + case 'A': + case 'a': + return 0; + case 'C': + case 'c': + return 1; + case 'G': + case 'g': + return 2; + case 'T': + case 't': + return 3; + case '-': + // case 'N': // do not want N to count + return 4; + default: + return 5; } } public: using Val = Base; - using MatrixIdx = Idx; - using ColSpace = std::vector; + using MatrixIdx = Idx; + using ColSpace = std::vector; using TId = unsigned; - decltype(auto) GetColSpace() const { - return (cspace_); - } + decltype(auto) GetColSpace() const { return (cspace_); } - decltype(auto) GetBaseFreq(int i) const { - return (cspace_[i].base_freq_); - } + decltype(auto) GetBaseFreq(int i) const { return (cspace_[i].base_freq_); } - decltype(auto) GetBQMean(int i) const { - return (cspace_[i].bqual_mean); - } + decltype(auto) GetBQMean(int i) const { return (cspace_[i].bqual_mean); } - decltype(auto) GetMQMean(int i) const { - return (cspace_[i].mqual_mean); - } + decltype(auto) GetMQMean(int i) const { return (cspace_[i].mqual_mean); } - decltype(auto) GetStrandMean(int i) const { - return (cspace_[i].strand_mean); - } + decltype(auto) GetStrandMean(int i) const { return (cspace_[i].strand_mean); } - decltype(auto) GetLSCMean(int i) const { - return (cspace_[i].lsc_mean); - } + decltype(auto) GetLSCMean(int i) const { return (cspace_[i].lsc_mean); } - decltype(auto) GetRSCMean(int i) const { - return (cspace_[i].rsc_mean); - } + decltype(auto) GetRSCMean(int i) const { return (cspace_[i].rsc_mean); } - decltype(auto) GetTagMean(int i) const { - return (cspace_[i].tag_mean); - } + decltype(auto) GetTagMean(int i) const { return (cspace_[i].tag_mean); } - size_t ncol() const { - return cspace_.size(); - } + size_t ncol() const { return cspace_.size(); } - size_t nrow() const { - return nrow_; - } + size_t nrow() const { return nrow_; } - decltype(auto) GetGappedRef() const { - return (gapped_ref_str_); - } + decltype(auto) GetGappedRef() const { return (gapped_ref_str_); } - CondensedArray() : nrow_(0) {} + CondensedArray() : nrow_(0) {} - template - explicit CondensedArray(const std::vector& msa, const int& total_cov, const GInv& ginv, const std::string& refgap) : - nrow_(msa.size()), - cspace_(msa[0].size(), - Col(msa.size())), - gapped_ref_str_ (refgap) - { - _CheckInput(msa); + template + explicit CondensedArray(const std::vector& msa, const int& total_cov, + const GInv& ginv, const std::string& refgap) + : nrow_(msa.size()), + cspace_(msa[0].size(), Col(msa.size())), + gapped_ref_str_(refgap) { + _CheckInput(msa); for (size_t i = 0; i < msa.size(); ++i) { auto dna5qseq = _StringToDnaInt(msa[i]); this->row_push(dna5qseq.begin(), dna5qseq.end(), i); } } - template - explicit CondensedArray(const MSA& msa, const bool calculate_qual) : - nrow_(msa.size()), - cspace_(msa.ncol()), - gapped_ref_str_(msa.gapped_ref_str()) - { - + template + explicit CondensedArray(const MSA& msa, const bool calculate_qual) + : nrow_(msa.size()), cspace_(msa.ncol()), gapped_ref_str_(msa.gapped_ref_str()) { // auto start_3p0 = std::chrono::system_clock::now(); - for (int pp = 0; pp < msa.ncol(); ++pp) { cspace_[pp].base_freq_[MISSING_CODE] = nrow_; } @@ -173,19 +147,20 @@ class CondensedArray{ // auto start_3p0p0 = std::chrono::system_clock::now(); auto const& r = msa.bam_records()[i]; - auto const& cigar = r.GetCigar(); + auto const& cigar = r.GetCigar(); const auto seq_qual = msa.GetGappedSeqAndQual(r); const auto& seq = seq_qual.first; // -1,+1 for INS at the begin or end of a read - auto s = msa.GapPosition(std::max(r.Position() - 1 , msa.ref_gaps().left()) - msa.ref_gaps().left()); - auto e = msa.GapPosition(std::min(r.PositionEnd() + 1, msa.ref_gaps().right()) - msa.ref_gaps().left() - 1); + auto s = msa.GapPosition(std::max(r.Position() - 1, msa.ref_gaps().left()) - + msa.ref_gaps().left()); + auto e = msa.GapPosition(std::min(r.PositionEnd() + 1, msa.ref_gaps().right()) - + msa.ref_gaps().left() - 1); // for (int pp = 0; pp < s; ++pp) { // ++cspace_[pp].base_freq_[MISSING_CODE]; // } - // for (int pp = e + 1; pp < msa.ncol(); ++pp){ // ++cspace_[pp].base_freq_[MISSING_CODE]; // } @@ -193,7 +168,7 @@ class CondensedArray{ // auto end_3p0p0 = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds_3p0p0 = end_3p0p0-start_3p0p0; // std::time_t end_time_3p0p0 = std::chrono::system_clock::to_time_t(end_3p0p0); - // std::cout << "elapsed_3p0p0 time: " << elapsed_seconds_3p0p0.count() << "s\n"; + // std::cout << "elapsed_3p0p0 time: " << elapsed_seconds_3p0p0.count() << "s\n"; // auto start_3p0p1 = std::chrono::system_clock::now(); for (int pp = s; pp <= e; ++pp) { @@ -204,80 +179,85 @@ class CondensedArray{ // auto end_3p0p1 = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds_3p0p1 = end_3p0p1-start_3p0p1; // std::time_t end_time_3p0p1 = std::chrono::system_clock::to_time_t(end_3p0p1); - // std::cout << "elapsed_3p0p1 time: " << elapsed_seconds_3p0p1.count() << "s\n"; + // std::cout << "elapsed_3p0p1 time: " << elapsed_seconds_3p0p1.count() << "s\n"; // auto start_3p0p2 = std::chrono::system_clock::now(); if (calculate_qual) { // auto start_3p0p2p0 = std::chrono::system_clock::now(); const auto& qual = seq_qual.second; - int strand = (int) !r.ReverseFlag(); + int strand = (int)!r.ReverseFlag(); // auto end_3p0p2p0 = std::chrono::system_clock::now(); - // std::chrono::duration elapsed_seconds_3p0p2p0 = end_3p0p2p0-start_3p0p2p0; - // std::time_t end_time_3p0p2p0 = std::chrono::system_clock::to_time_t(end_3p0p2p0); - // std::cout << "elapsed_3p0p2p0 time: " << elapsed_seconds_3p0p2p0.count() << "s\n"; - // auto start_3p0p2p1 = std::chrono::system_clock::now(); + // std::chrono::duration elapsed_seconds_3p0p2p0 = + // end_3p0p2p0-start_3p0p2p0; std::time_t end_time_3p0p2p0 = + // std::chrono::system_clock::to_time_t(end_3p0p2p0); std::cout << + // "elapsed_3p0p2p0 time: " << elapsed_seconds_3p0p2p0.count() << "s\n"; auto + // start_3p0p2p1 = std::chrono::system_clock::now(); const auto clips = msa.GetClipping(r); const auto tags = msa.GetTags(r, 5); // auto end_3p0p2p1 = std::chrono::system_clock::now(); - // std::chrono::duration elapsed_seconds_3p0p2p1 = end_3p0p2p1-start_3p0p2p1; - // std::time_t end_time_3p0p2p1 = std::chrono::system_clock::to_time_t(end_3p0p2p1); - // std::cout << "elapsed_3p0p2p1 time: " << elapsed_seconds_3p0p2p1.count() << "s\n"; - // auto start_3p0p2p2 = std::chrono::system_clock::now(); + // std::chrono::duration elapsed_seconds_3p0p2p1 = + // end_3p0p2p1-start_3p0p2p1; std::time_t end_time_3p0p2p1 = + // std::chrono::system_clock::to_time_t(end_3p0p2p1); std::cout << + // "elapsed_3p0p2p1 time: " << elapsed_seconds_3p0p2p1.count() << "s\n"; auto + // start_3p0p2p2 = std::chrono::system_clock::now(); - if (clips.first != -1) cspace_[clips.first].lsc_mean[DnaCharToDnaCode(seq[clips.first])] ++; - if (clips.second != -1) cspace_[clips.second].rsc_mean[DnaCharToDnaCode(seq[clips.second])] ++; + if (clips.first != -1) + cspace_[clips.first].lsc_mean[DnaCharToDnaCode(seq[clips.first])]++; + if (clips.second != -1) + cspace_[clips.second].rsc_mean[DnaCharToDnaCode(seq[clips.second])]++; // auto end_3p0p2p2 = std::chrono::system_clock::now(); - // std::chrono::duration elapsed_seconds_3p0p2p2 = end_3p0p2p2-start_3p0p2p2; - // std::time_t end_time_3p0p2p2 = std::chrono::system_clock::to_time_t(end_3p0p2p2); - // std::cout << "elapsed_3p0p2p2 time: " << elapsed_seconds_3p0p2p2.count() << "s\n"; - // auto start_3p0p2p3 = std::chrono::system_clock::now(); + // std::chrono::duration elapsed_seconds_3p0p2p2 = + // end_3p0p2p2-start_3p0p2p2; std::time_t end_time_3p0p2p2 = + // std::chrono::system_clock::to_time_t(end_3p0p2p2); std::cout << + // "elapsed_3p0p2p2 time: " << elapsed_seconds_3p0p2p2.count() << "s\n"; auto + // start_3p0p2p3 = std::chrono::system_clock::now(); for (int pp = s; pp <= e; ++pp) { - cspace_[pp].bqual_mean[DnaCharToDnaCode(seq[pp])] += float(qual[pp] - 33) ; + cspace_[pp].bqual_mean[DnaCharToDnaCode(seq[pp])] += float(qual[pp] - 33); cspace_[pp].strand_mean[DnaCharToDnaCode(seq[pp])] += strand; - cspace_[pp].mqual_mean[ DnaCharToDnaCode(seq[pp]) ] += r.MapQuality(); + cspace_[pp].mqual_mean[DnaCharToDnaCode(seq[pp])] += r.MapQuality(); for (size_t ii = 0; ii < Col::TAG_SIZE; ++ii) { - cspace_[pp].tag_mean[ DnaCharToDnaCode(seq[pp]) ][ii] += tags[ii]; + cspace_[pp].tag_mean[DnaCharToDnaCode(seq[pp])][ii] += tags[ii]; } } // auto end_3p0p2p3 = std::chrono::system_clock::now(); - // std::chrono::duration elapsed_seconds_3p0p2p3 = end_3p0p2p3-start_3p0p2p3; - // std::time_t end_time_3p0p2p3 = std::chrono::system_clock::to_time_t(end_3p0p2p3); - // std::cout << "elapsed_3p0p2p3 time: " << elapsed_seconds_3p0p2p3.count() << "s\n"; - + // std::chrono::duration elapsed_seconds_3p0p2p3 = + // end_3p0p2p3-start_3p0p2p3; std::time_t end_time_3p0p2p3 = + // std::chrono::system_clock::to_time_t(end_3p0p2p3); std::cout << + // "elapsed_3p0p2p3 time: " << elapsed_seconds_3p0p2p3.count() << "s\n"; } // auto end_3p0p2 = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds_3p0p2 = end_3p0p2-start_3p0p2; // std::time_t end_time_3p0p2 = std::chrono::system_clock::to_time_t(end_3p0p2); - // std::cout << "elapsed_3p0p2 time: " << elapsed_seconds_3p0p2.count() << "s\n"; - + // std::cout << "elapsed_3p0p2 time: " << elapsed_seconds_3p0p2.count() << "s\n"; - }//end for + } // end for // auto end_3p0 = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds_3p0 = end_3p0-start_3p0; // std::time_t end_time_3p0 = std::chrono::system_clock::to_time_t(end_3p0); - // std::cout << "elapsed_3p0 time: " << elapsed_seconds_3p0.count() << "s\n"; + // std::cout << "elapsed_3p0 time: " << elapsed_seconds_3p0.count() << "s\n"; // auto start_3p1 = std::chrono::system_clock::now(); if (calculate_qual) { for (size_t ii = 0; ii < msa.ncol(); ++ii) { - auto & col = cspace_[ii]; - for (auto s = 0; s < Col::ALPHABET_SIZE; ++ s) { - if (col.base_freq_[s] == 0) continue; - col.bqual_mean[s]/=col.base_freq_[s]; - col.mqual_mean[s]/=col.base_freq_[s]; - col.strand_mean[s]/=col.base_freq_[s]; - col.strand_mean[s]*=100.0; + auto& col = cspace_[ii]; + for (auto s = 0; s < Col::ALPHABET_SIZE; ++s) { + if (col.base_freq_[s] == 0) + continue; + col.bqual_mean[s] /= col.base_freq_[s]; + col.mqual_mean[s] /= col.base_freq_[s]; + col.strand_mean[s] /= col.base_freq_[s]; + col.strand_mean[s] *= 100.0; for (size_t ii = 0; ii < Col::TAG_SIZE; ++ii) { - col.tag_mean[s][ii]/=col.base_freq_[s]; + col.tag_mean[s][ii] /= col.base_freq_[s]; } } } @@ -286,28 +266,25 @@ class CondensedArray{ // auto end_3p1 = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds_3p1 = end_3p1-start_3p1; // std::time_t end_time_3p1 = std::chrono::system_clock::to_time_t(end_3p1); - // std::cout << "elapsed_3p1 time: " << elapsed_seconds_3p1.count() << "s\n"; - - } - - - template - explicit CondensedArray(const std::vector& msa, const std::vector& bqual, - const std::vector& mqual, const std::vector& strand, - const std::vector& lsc, const std::vector& rsc, - const std::vector>& tags, - const int& total_cov, const GInv& ginv, const std::string& refgap): - nrow_(msa.size()), - cspace_(msa[0].size(), Col(msa.size())), - gapped_ref_str_(refgap), - mquals_(nrow_), - strands_(nrow_), - lsc_(nrow_), - rsc_(nrow_), - tags_(nrow_, std::vector(5)) - { - - _CheckInput(msa); + // std::cout << "elapsed_3p1 time: " << elapsed_seconds_3p1.count() << "s\n"; + } + + template + explicit CondensedArray(const std::vector& msa, + const std::vector& bqual, + const std::vector& mqual, const std::vector& strand, + const std::vector& lsc, const std::vector& rsc, + const std::vector>& tags, const int& total_cov, + const GInv& ginv, const std::string& refgap) + : nrow_(msa.size()), + cspace_(msa[0].size(), Col(msa.size())), + gapped_ref_str_(refgap), + mquals_(nrow_), + strands_(nrow_), + lsc_(nrow_), + rsc_(nrow_), + tags_(nrow_, std::vector(5)) { + _CheckInput(msa); for (size_t i = 0; i < msa.size(); ++i) { auto dna5qseq = _StringToDnaInt(msa[i]); @@ -319,11 +296,9 @@ class CondensedArray{ this->row_push_strand(strand[i], i); this->row_push_tag(tags[i], i); } - } - - template + template void row_push(BaseItr b, BaseItr e, const Idx rid) { BaseItr it = b; size_t cid = 0; @@ -332,7 +307,7 @@ class CondensedArray{ } } - template + template void row_push_bqual(BaseItr b, BaseItr e, const Idx rid) { BaseItr it = b; size_t cid = 0; @@ -341,29 +316,21 @@ class CondensedArray{ } } - void row_push_mqual(int mqual, const Idx rid) { - mquals_[rid] = mqual; - } + void row_push_mqual(int mqual, const Idx rid) { mquals_[rid] = mqual; } - void row_push_strand(int strand, const Idx rid) { - strands_[rid] = strand; - } + void row_push_strand(int strand, const Idx rid) { strands_[rid] = strand; } - void row_push_lsc(int pos, const Idx rid) { - lsc_[rid] = pos; - } + void row_push_lsc(int pos, const Idx rid) { lsc_[rid] = pos; } - void row_push_rsc(int pos, const Idx rid) { - rsc_[rid] = pos; - } + void row_push_rsc(int pos, const Idx rid) { rsc_[rid] = pos; } void row_push_tag(std::vector tags, const Idx rid) { for (size_t i = 0; i < tags.size(); ++i) { tags_[rid][i] = tags[i]; - } + } } - template + template void col_push(BaseItr b, BaseItr e, const Idx cid) { BaseItr it = b; size_t i = 0; @@ -383,26 +350,26 @@ class CondensedArray{ void InitWithAlnMetaData() { for (size_t i = 0; i < ncol(); ++i) { - //column-wise + // column-wise auto& col = cspace_[i]; - for (auto b = 0; b < Col::ALPHABET_SIZE; ++ b) { - col.bqual_mean[b]=0; - col.mqual_mean[b]=0; - col.strand_mean[b]=0; - col.lsc_mean[b]=0; - col.rsc_mean[b]=0; + for (auto b = 0; b < Col::ALPHABET_SIZE; ++b) { + col.bqual_mean[b] = 0; + col.mqual_mean[b] = 0; + col.strand_mean[b] = 0; + col.lsc_mean[b] = 0; + col.rsc_mean[b] = 0; for (size_t ii = 0; ii < Col::TAG_SIZE; ++ii) { - col.tag_mean[b][ii]=0; + col.tag_mean[b][ii] = 0; } } - //element-wise + // element-wise for (size_t j = 0; j < nrow(); ++j) { col.base_freq_[col.bases()[j]]++; - col.bqual_mean[col.bases()[j]]+=float(col.bquals()[j]-33); - col.mqual_mean[col.bases()[j]]+=mquals_[j]; - col.strand_mean[col.bases()[j]]+=strands_[j]; + col.bqual_mean[col.bases()[j]] += float(col.bquals()[j] - 33); + col.mqual_mean[col.bases()[j]] += mquals_[j]; + col.strand_mean[col.bases()[j]] += strands_[j]; col.lsc_mean[col.bases()[j]] += lsc_[j] == i ? 1 : 0; col.rsc_mean[col.bases()[j]] += rsc_[j] == i ? 1 : 0; for (size_t ii = 0; ii < Col::TAG_SIZE; ++ii) { @@ -410,31 +377,27 @@ class CondensedArray{ } } - for (auto s = 0; s < Col::ALPHABET_SIZE; ++ s) { - if (col.base_freq_[s] == 0) continue; - col.bqual_mean[s]/=col.base_freq_[s]; - col.mqual_mean[s]/=col.base_freq_[s]; - col.strand_mean[s]/=col.base_freq_[s]; - col.strand_mean[s]*=100.0; + for (auto s = 0; s < Col::ALPHABET_SIZE; ++s) { + if (col.base_freq_[s] == 0) + continue; + col.bqual_mean[s] /= col.base_freq_[s]; + col.mqual_mean[s] /= col.base_freq_[s]; + col.strand_mean[s] /= col.base_freq_[s]; + col.strand_mean[s] *= 100.0; for (size_t ii = 0; ii < Col::TAG_SIZE; ++ii) { - col.tag_mean[s][ii]/=col.base_freq_[s]; + col.tag_mean[s][ii] /= col.base_freq_[s]; } } } } - decltype(auto) total_cov() const { - return (nrow_); - } + decltype(auto) total_cov() const { return (nrow_); } - decltype(auto) cspace() const { - return (cspace_); - } - template + decltype(auto) cspace() const { return (cspace_); } + template friend std::ostream& operator<<(std::ostream&, const CondensedArray&); private: - size_t nrow_; ColSpace cspace_; std::string gapped_ref_str_; @@ -449,8 +412,9 @@ class CondensedArray{ if (msa.empty()) { throw std::runtime_error("empty msa"); } - for(auto const& row : msa) { - if (ncol == 0) ncol = row.length(); + for (auto const& row : msa) { + if (ncol == 0) + ncol = row.length(); else { if (ncol != row.length()) { throw std::runtime_error("input msa has unequal row lengthes"); @@ -459,38 +423,38 @@ class CondensedArray{ } } - std::vector _StringToDnaChar(const std::string &s) { + std::vector _StringToDnaChar(const std::string& s) { std::vector dna5qseq(s.size()); for (size_t j = 0; j < s.size(); ++j) { - switch(s[j]) { - case 'A': - case 'a': - dna5qseq[j] = 'A'; - break; - case 'C': - case 'c': - dna5qseq[j] = 'C'; - break; - case 'T': - case 't': - dna5qseq[j] = 'T'; - break; - case 'G': - case 'g': - dna5qseq[j] = 'G'; - break; - case '-': - dna5qseq[j] = '-'; - break; - default: - dna5qseq[j] = missing_chr_; - break; + switch (s[j]) { + case 'A': + case 'a': + dna5qseq[j] = 'A'; + break; + case 'C': + case 'c': + dna5qseq[j] = 'C'; + break; + case 'T': + case 't': + dna5qseq[j] = 'T'; + break; + case 'G': + case 'g': + dna5qseq[j] = 'G'; + break; + case '-': + dna5qseq[j] = '-'; + break; + default: + dna5qseq[j] = missing_chr_; + break; } } return dna5qseq; } - std::vector _StringToDnaInt(const std::string &s) { + std::vector _StringToDnaInt(const std::string& s) { std::vector dna5qseq(s.size()); for (size_t j = 0; j < s.size(); ++j) { dna5qseq[j] = DnaCharToDnaCode(s[j]); @@ -498,139 +462,161 @@ class CondensedArray{ return dna5qseq; } - void PushBQual_(Idx row, Idx col, Val val) { - cspace_[col].emplace_qual(row, val); - } - - - void Push_(Idx row, Idx col, Val val) { - cspace_[col].emplace(row, val); - } + void PushBQual_(Idx row, Idx col, Val val) { cspace_[col].emplace_qual(row, val); } + void Push_(Idx row, Idx col, Val val) { cspace_[col].emplace(row, val); } }; -template +template std::ostream& operator<<(std::ostream& os, const CondensedArray& ca) { - for (const auto& col: ca.cspace_) { + for (const auto& col : ca.cspace_) { std::string col_s; for (const auto& b : col.base_freq_) { col_s += std::to_string(b) + ":"; } col_s += "\n"; os << col_s; - } + } return os; } -template -decltype(auto) CreateCondensedArray(const std::vector& msa, const int total_cov, const GInv& ginv, const RefGap& refgap) { +template +decltype(auto) CreateCondensedArray(const std::vector& msa, + const int total_cov, const GInv& ginv, + const RefGap& refgap) { using CondensedArray = neusomatic::CondensedArray; CondensedArray condensed_array(msa, total_cov, ginv, refgap); condensed_array.Init(); return condensed_array; } - -template -decltype(auto) CreateCondensedArray(const std::vector& msa, const std::vector& bqual, const std::vector& mqual, const std::vector& strand, - const std::vector& lscs, const std::vector& rscs, - const std::vector>& tag, - const int total_cov, const GInv& ginv, const RefGap& refgap) { - using CondensedArray = neusomatic::CondensedArray; - CondensedArray condensed_array(msa, bqual, mqual, strand, lscs, rscs, tag, total_cov, ginv, refgap); +template +decltype(auto) CreateCondensedArray( + const std::vector& msa, const std::vector& bqual, + const std::vector& mqual, const std::vector& strand, + const std::vector& lscs, const std::vector& rscs, + const std::vector>& tag, const int total_cov, const GInv& ginv, + const RefGap& refgap) { + using CondensedArray = neusomatic::CondensedArray; + CondensedArray condensed_array(msa, bqual, mqual, strand, lscs, rscs, tag, total_cov, + ginv, refgap); condensed_array.InitWithAlnMetaData(); return condensed_array; } - -std::string add_qual_col(auto & data_array, bool is_int=false){ +std::string add_qual_col(auto& data_array, bool is_int = false) { auto sep = ":"; - int order [5] = { 4, 0, 1, 2, 3 }; + int order[5] = {4, 0, 1, 2, 3}; std::string ret = ""; - for ( int n=0 ; n<5 ; ++n ) - { - if (is_int){ + for (int n = 0; n < 5; ++n) { + if (is_int) { ret += std::to_string(data_array[order[n]]); - }else{ + } else { ret += std::to_string(int(round(data_array[order[n]]))); } - if (n < 4){ - ret+=":"; + if (n < 4) { + ret += ":"; } } return ret; } - -void add_qual_col(auto & count_writer, auto & data_array, bool is_int, const std::string& end_str ){ +void add_qual_col(auto& count_writer, auto& data_array, bool is_int, + const std::string& end_str) { auto sep = ":"; - int order [5] = { 4, 0, 1, 2, 3 }; - if (is_int){ - count_writer << (data_array[order[0]])<<":"<<(data_array[order[1]])<<":"<<(data_array[order[2]])<<":"<<(data_array[order[3]])<<":"<<(data_array[order[4]])<(result, ":")); - // cout << result<(result, + // ":")); cout << result< +#include + #include #include +#include -#include #include "Options.h" -namespace neusomatic{ +namespace neusomatic { -template +template class CaptureLayout { size_t region_idx_; - const neusomatic::Options & opts_; + const neusomatic::Options& opts_; BamReader bam_reader_; std::vector ginvs_; - //std::map > contig_ginv_; - std::map > ginv_records_; + // std::map > contig_ginv_; + std::map> ginv_records_; public: - template - CaptureLayout(const std::string& bam_path, const std::vector& regions, const neusomatic::Options& o): - region_idx_(0), opts_(o) { - bam_reader_.Open(bam_path); - LoadRegions_(regions); + template + CaptureLayout(const std::string& bam_path, const std::vector& regions, + const neusomatic::Options& o) + : region_idx_(0), opts_(o) { + bam_reader_.Open(bam_path); + LoadRegions_(regions); } - decltype(auto) ginv_records() const { - return (ginv_records_); - } + decltype(auto) ginv_records() const { return (ginv_records_); } - decltype(auto) Reader() const { - return (bam_reader_); - } + decltype(auto) Reader() const { return (bam_reader_); } - size_t NumRegion() const { - return ginvs_.size(); - } + size_t NumRegion() const { return ginvs_.size(); } bool NextRegion(const bool full_contained) { - if (region_idx_ == ginvs_.size()) return false; + if (region_idx_ == ginvs_.size()) + return false; ginv_records_.clear(); const GInv& curr_ginv = ginvs_[region_idx_]; SeqLib::GenomicRegion gr(curr_ginv.contig(), curr_ginv.left(), curr_ginv.right()); - bam_reader_.SetRegion(gr); + bam_reader_.SetRegion(gr); std::vector records; BamRecord rec; while (bam_reader_.GetNextRecord(rec)) { bool good = true; if (full_contained) { - if (rec.Position() > curr_ginv.left() || rec.PositionEnd() + 1 < curr_ginv.right()) { + if (rec.Position() > curr_ginv.left() || + rec.PositionEnd() + 1 < curr_ginv.right()) { good = false; } - } - else { - if (rec.Position() >= curr_ginv.right() || curr_ginv.left() > rec.PositionEnd()) { // overlapped + } else { + if (rec.Position() >= curr_ginv.right() || + curr_ginv.left() > rec.PositionEnd()) { // overlapped good = false; } } - if (rec.MapQuality() < opts_.min_mapq()) { // mapq filter - good = false; + if (rec.MapQuality() < opts_.min_mapq()) { // mapq filter + good = false; } - if (!opts_.include_secondary() && rec.SecondaryFlag()) { // secondary flag - good = false; + if (!opts_.include_secondary() && rec.SecondaryFlag()) { // secondary flag + good = false; } - if (opts_.filter_duplicate() && rec.DuplicateFlag()) { // duplicate flag + if (opts_.filter_duplicate() && rec.DuplicateFlag()) { // duplicate flag good = false; } - if (opts_.filter_QCfailed() && rec.QCFailFlag()) { // QC failed flag + if (opts_.filter_QCfailed() && rec.QCFailFlag()) { // QC failed flag good = false; } - if (opts_.filter_improper_pair() && !rec.ProperPair()) { // Proper pair flag. Set by aligner + if (opts_.filter_improper_pair() && + !rec.ProperPair()) { // Proper pair flag. Set by aligner good = false; } - if (opts_.filter_mate_unmapped() && !rec.MateMappedFlag()) { // Mate is ummapped + if (opts_.filter_mate_unmapped() && !rec.MateMappedFlag()) { // Mate is ummapped good = false; } - if (opts_.filter_improper_orientation() && !rec.ProperOrientation()) { // pair read has proper orientation (FR) and on same chrom. + if (opts_.filter_improper_orientation() && + !rec.ProperOrientation()) { // pair read has proper orientation (FR) and on + // same chrom. good = false; } if (good) { - if(rec.GetCigar().size() == 0) { + if (rec.GetCigar().size() == 0) { std::cerr << "warning: " << rec.Qname() << " has no cigar\n"; } else if (rec.Position() == rec.PositionEnd()) { std::cerr << "warning: " << rec.Qname() << " has no acutall alignment\n"; - } - else { + } else { records.push_back(rec); } - } + } } ginv_records_[curr_ginv] = records; - region_idx_ ++; + region_idx_++; return true; } private: - template + template void LoadRegions_(const std::vector& regions) { - ginvs_.resize(regions.size()); + ginvs_.resize(regions.size()); for (size_t i = 0; i < regions.size(); ++i) { - ginvs_[i] = GInv(bam_reader_.Header().Name2ID(regions[i].contig()), regions[i].left(), regions[i].right()); + ginvs_[i] = GInv(bam_reader_.Header().Name2ID(regions[i].contig()), + regions[i].left(), regions[i].right()); } } - }; -}//namespace neusomatic +} // namespace neusomatic #endif /* LIB_INCLUDE_FRAGSTORESEQAN_HPP_ */ diff --git a/neusomatic/include/variant.hpp b/neusomatic/include/variant.hpp index 1853149..5087509 100644 --- a/neusomatic/include/variant.hpp +++ b/neusomatic/include/variant.hpp @@ -1,63 +1,54 @@ #ifndef VARIANT_HPP #define VARIANT_HPP -#include -#include -#include "SeqanUtils.h" +#include +#include + #include "Interval.hpp" +#include "SeqanUtils.h" //#include /* - * Standard: any positions(or interval) of a vcf variant should + * Standard: any positions(or interval) of a vcf variant should * refer to reference positions. */ -namespace neusomatic { namespace bio { - - -enum class VarType{ - DEL = 0, - INS, - SUB -}; - -inline std::ostream& operator<<(std::ostream &os, const VarType& var) -{ - switch(var) - { - case VarType::DEL: - os<<"D"; - break; - case VarType::INS: - os<<"I"; - break; - case VarType::SUB: - os<<"S"; - break; +namespace neusomatic { +namespace bio { + +enum class VarType { DEL = 0, INS, SUB }; + +inline std::ostream& operator<<(std::ostream& os, const VarType& var) { + switch (var) { + case VarType::DEL: + os << "D"; + break; + case VarType::INS: + os << "I"; + break; + case VarType::SUB: + os << "S"; + break; } return os; -} - +} ///////////////////// //////////////////// - - -template -class Variant{ +template +class Variant { public: - typedef T1 TSeq; + typedef T1 TSeq; typedef T2 TContig; typedef neusomatic::bio::GenomicInterval TInv; - /* * Deletion type variant ctor * 6 paramters */ - //Variant(): del_len_(0), ins_len_(0){ - //std::cerr<<"in default ctor ginv is: "< allele_.length()) { - return VarType::DEL; + return VarType::DEL; } else if (ginv_.length() < allele_.length()) { return VarType::INS; } else { @@ -115,48 +98,46 @@ class Variant{ } } -// bool IsSnp() const { -// return length() == seqan::length(allele_); -// } - + // bool IsSnp() const { + // return length() == seqan::length(allele_); + // } -/* - * - */ + /* + * + */ private: int id_ = INT_MIN; TInv ginv_; TSeq allele_; - //bool final_; + // bool final_; }; -template -inline bool operator==(const Variant& lhs, const Variant& rhs) -{ - if (lhs.ginv() != rhs.ginv()) return false; - if (lhs.allele() != rhs.allele()) return false; +template +inline bool operator==(const Variant& lhs, + const Variant& rhs) { + if (lhs.ginv() != rhs.ginv()) + return false; + if (lhs.allele() != rhs.allele()) + return false; return true; } -template -inline bool operator!=(const Variant& lhs, const Variant& rhs) -{ +template +inline bool operator!=(const Variant& lhs, + const Variant& rhs) { return !(lhs == rhs); } -template -inline bool operator<(const Variant& lhs, const Variant& rhs) -{ +template +inline bool operator<(const Variant& lhs, + const Variant& rhs) { return lhs.ginv() < rhs.ginv(); } - -template -inline std::ostream& operator<<(std::ostream& os, const Variant& var) -{ - os << var.ginv() - <<":"< +inline std::ostream& operator<<(std::ostream& os, const Variant& var) { + os << var.ginv() << ":" << neusomatic::bio::to_string(var.allele()); return os; } @@ -167,35 +148,33 @@ inline std::ostream& operator<<(std::ostream& os, const Variant& var) * */ -template -class VariantGroup{ -/* - * TGaps should be in compliance with seqan Gaps type - */ +template +class VariantGroup { + /* + * TGaps should be in compliance with seqan Gaps type + */ public: using Variant = TVariant; - typedef typename TVariant::TSeq TSeq; + typedef typename TVariant::TSeq TSeq; typedef typename TVariant::TContig TContig; typedef typename TVariant::TInv TInv; -//TContig chromosome - VariantGroup(TContig ctg, StrandType strand, const neusomatic::bio::RefSeqs& ref_seqs): - ginv_(ctg, strand), ref_seqs_(&ref_seqs){} + // TContig chromosome + VariantGroup(TContig ctg, StrandType strand, const neusomatic::bio::RefSeqs& ref_seqs) + : ginv_(ctg, strand), ref_seqs_(&ref_seqs) {} - VariantGroup(TContig ctg, const neusomatic::bio::RefSeqs& ref_seqs): - ginv_(ctg, StrandType::UNKNOWN), ref_seqs_(&ref_seqs){} - - typename TVariant::TSeq GetRefAllele() const - { + VariantGroup(TContig ctg, const neusomatic::bio::RefSeqs& ref_seqs) + : ginv_(ctg, StrandType::UNKNOWN), ref_seqs_(&ref_seqs) {} + typename TVariant::TSeq GetRefAllele() const { const auto& ref_seq = (*ref_seqs_)[contig()]; typename TVariant::TSeq ref_allele; int vgleft = ginv_.left(); int vgright = ginv_.right(); - //if (IsSnp()) ++vgleft; + // if (IsSnp()) ++vgleft; - for(int i = vgleft; i < vgright; ++i){ + for (int i = vgleft; i < vgright; ++i) { ref_allele += ref_seq[i]; } return ref_allele; @@ -203,48 +182,47 @@ class VariantGroup{ decltype(auto) GetAltAlleles() const { std::map var_seq_map; - for(auto itr = cid_vid_.begin(); itr != cid_vid_.end(); ++itr) { - auto search = var_seq_map.find(itr->second); + for (auto itr = cid_vid_.begin(); itr != cid_vid_.end(); ++itr) { + auto search = var_seq_map.find(itr->second); if (search == var_seq_map.end()) { var_seq_map.emplace(itr->second, GetAltAlleles_(itr->second)); } else { - //std::cerr<<"varaints have existed in other consensus. skipping..."<second], new_var); - if (!var_to_add.Valid()) return; + if (!var_to_add.Valid()) + return; } - //then add the variant + // then add the variant size_t vid; const auto& found = std::find(variants_.begin(), variants_.end(), var_to_add); - if (found != variants_.end()) { //existing variants. + if (found != variants_.end()) { // existing variants. vid = std::distance(variants_.begin(), found); cid_vid_[sid] = vid; } else { // new variant this->ginv_.left() = std::min(this->left(), new_var.left()); #ifdef DEBUG if (new_var.contig() != this->contig()) { - std::cerr<<"Warning: a variant from different contig cannot be add to the VariantGroup!\n"; + std::cerr << "Warning: a variant from different contig cannot be add to the " + "VariantGroup!\n"; return; } #endif @@ -257,20 +235,14 @@ class VariantGroup{ } } + bool empty() const { return variants_.empty(); } - bool empty() const { - return variants_.empty(); - } - - size_t size() const { - return variants_.size(); - } - - decltype(auto) ginv() const {return (ginv_);} + size_t size() const { return variants_.size(); } + decltype(auto) ginv() const { return (ginv_); } - decltype(auto) contig() const {return (ginv_.contig());} - decltype(auto) vid_cid() const {return (vid_cid_);} + decltype(auto) contig() const { return (ginv_.contig()); } + decltype(auto) vid_cid() const { return (vid_cid_); } decltype(auto) Init() { for (auto it = cid_vid_.cbegin(); it != cid_vid_.cend(); ++it) { @@ -279,11 +251,11 @@ class VariantGroup{ } private: - - //unique set. But it may contain deprecated variants due to MergeVaraint() + // unique set. But it may contain deprecated variants due to MergeVaraint() std::vector variants_; - std::map cid_vid_; //map consensus index to variant index - std::map> vid_cid_; // map var index to list of consensus ids + std::map cid_vid_; // map consensus index to variant index + std::map> + vid_cid_; // map var index to list of consensus ids TInv ginv_; @@ -293,7 +265,8 @@ class VariantGroup{ typename TVariant::TSeq seq; #ifdef DEBUG if (left.contig() != left.contig() || right < left) { - std::cerr<<"Warning: variant "<right(); ++curr) { - alt_allele += ref_seq[curr]; + alt_allele += ref_seq[curr]; } return alt_allele; } }; -template -inline std::ostream& operator<<(std::ostream& os, const VariantGroup& vg) -{ - int i=1; - os << "Variant Group: " < +inline std::ostream& operator<<(std::ostream& os, const VariantGroup& vg) { + int i = 1; + os << "Variant Group: " << vg.ginv() << " has " << vg.size() << " variants."; + for (const auto& var : vg.variants()) { + os << "\n"; + os << "variant " << std::to_string(i) << ": " << var << "\n"; ++i; } return os; } - -}//namespace bio -}//namespace neusomatic +} // namespace bio +} // namespace neusomatic #endif /* VARIANT_HPP */ diff --git a/neusomatic/include/vcf.h b/neusomatic/include/vcf.h index fd2bbd6..f98832e 100644 --- a/neusomatic/include/vcf.h +++ b/neusomatic/include/vcf.h @@ -1,33 +1,33 @@ -#ifndef NEU_SOMATIC_VCF_H +#ifndef NEU_SOMATIC_VCF_H #define NEU_SOMATIC_VCF_H -#include #include + +#include + #include "RefSeqs.h" namespace neusomatic { - namespace bio { +namespace bio { class VCFWriter { public: - VCFWriter(const std::string& vcf_file, const std::string& ref_file, const std::string& program_name) { + VCFWriter(const std::string& vcf_file, const std::string& ref_file, + const std::string& program_name) { if (!open(vcf_, vcf_file.c_str())) { throw std::runtime_error("Coult not open " + vcf_file + " for writing."); } WriteHeader(ref_file, program_name); } - void Write(seqan::VcfRecord /*const&*/ record) { // because seqan messed up constness + void Write(seqan::VcfRecord /*const&*/ record) { // because seqan messed up constness writeRecord(vcf_, record); } - decltype(auto) vcf_context() { - return context(vcf_); - } + decltype(auto) vcf_context() { return context(vcf_); } private: seqan::VcfFileOut vcf_; void WriteHeader(const std::string& ref_file, const std::string& program_name) { - neusomatic::bio::RefSeqs ref_seqs(ref_file); for (size_t i = 0; i < ref_seqs.size(); ++i) { appendValue(contigNames(context(vcf_)), ref_seqs.GetName(i)); @@ -36,13 +36,19 @@ class VCFWriter { appendValue(header, seqan::VcfHeaderRecord("fileformat", "VCFv4.2")); appendValue(header, seqan::VcfHeaderRecord("source", program_name)); appendValue(header, seqan::VcfHeaderRecord("reference", ref_file)); - //appendValue(header, seqan::VcfHeaderRecord("INFO", "")); - appendValue(header, seqan::VcfHeaderRecord("INFO", "")); - //appendValue(header, seqan::VcfHeaderRecord("FORMAT", "")); + // appendValue(header, seqan::VcfHeaderRecord("INFO", + // "")); + appendValue( + header, + seqan::VcfHeaderRecord( + "INFO", "")); + // appendValue(header, seqan::VcfHeaderRecord("FORMAT", + // "")); writeHeader(vcf_, header); } }; -}} +} // namespace bio +} // namespace neusomatic #endif