Skip to content

Commit

Permalink
Merge pull request #12 from joshuak94/bug_fix
Browse files Browse the repository at this point in the history
Bug fix
  • Loading branch information
smehringer authored Nov 27, 2019
2 parents 396512f + b88d08d commit 72d1e0c
Show file tree
Hide file tree
Showing 10 changed files with 94 additions and 77 deletions.
20 changes: 10 additions & 10 deletions include/sviper/auxiliary.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ struct input_output_information{
// Custom constructor.
input_output_information(CmdOptions & options_) : cmd_options(options_) {}

std::ofstream log_file;
CmdOptions cmd_options;
std::vector<std::unique_ptr<seqan::BamFileIn>> long_read_file_handles;
seqan::BamHeader long_read_header; // The bam header object needed to fill bam context
seqan::BamIndex<seqan::Bai> long_read_bai;
std::vector<std::unique_ptr<seqan::BamFileIn>> short_read_file_handles;
seqan::BamHeader short_read_header; // The bam header object needed to fill bam context
seqan::BamIndex<seqan::Bai> short_read_bai;
std::vector<std::unique_ptr<seqan::FaiIndex>> faidx_file_handles;
std::vector<seqan::BamAlignmentRecord> polished_reads; // stores records in case info.cmd_options.output-polished-bam is true
std::ofstream log_file{};
CmdOptions cmd_options{};
std::vector<std::unique_ptr<seqan::BamFileIn>> long_read_file_handles{};
seqan::BamHeader long_read_header{}; // The bam header object needed to fill bam context
seqan::BamIndex<seqan::Bai> long_read_bai{};
std::vector<std::unique_ptr<seqan::BamFileIn>> short_read_file_handles{};
seqan::BamHeader short_read_header{}; // The bam header object needed to fill bam context
seqan::BamIndex<seqan::Bai> short_read_bai{};
std::vector<std::unique_ptr<seqan::FaiIndex>> faidx_file_handles{};
std::vector<seqan::BamAlignmentRecord> polished_reads{}; // stores records in case info.cmd_options.output-polished-bam is true
};

bool prep_file_handles(input_output_information & info)
Expand Down
22 changes: 14 additions & 8 deletions include/sviper/basics.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ void advance_in_cigar(unsigned & cigar_pos,
if (stop_criterion(ref_pos, read_pos))
break;

if ((cigar[cigar_pos]).operation == 'M')
if ((cigar[cigar_pos]).operation == 'M' || (cigar[cigar_pos]).operation == 'X' || (cigar[cigar_pos]).operation == '=')
{
// Matches/mismatches advance both the read and the reference position
read_pos += (cigar[cigar_pos]).count;
Expand All @@ -162,6 +162,12 @@ void advance_in_cigar(unsigned & cigar_pos,
ref_pos += (cigar[cigar_pos]).count;
}
// else hard-clips ('H') advance neither the read nor the reference position
else if ((cigar[cigar_pos]).operation == 'H')
{}
else
{
throw std::logic_error{"UNKNOWN CIGAR CHARACTER."};
}

++cigar_pos;
}
Expand All @@ -184,8 +190,8 @@ inline std::tuple<int, int> get_read_region_boundaries(seqan::BamAlignmentRecord
int ref_region_begin,
int ref_region_end)
{
int read_region_begin;
int read_region_end;
int read_region_begin{};
int read_region_end{};

if (ref_region_begin >= ref_region_end)
{
Expand Down Expand Up @@ -309,7 +315,7 @@ inline void records_to_read_pairs(seqan::StringSet<seqan::Dna5QString> & reads1,
records.erase(last, records.end());

seqan::Dna5QString dummy_seq{}; // read sequence for dummy record is an empty string
seqan::CharString id1; // stores last seen id
seqan::CharString id1{}; // stores last seen id

for (unsigned i = 0; i < length(records); ++i)
{
Expand Down Expand Up @@ -398,7 +404,7 @@ inline void cut_down_high_coverage(std::vector<seqan::BamAlignmentRecord> & shor
if (short_reads.size() == 0)
return;

std::vector<seqan::BamAlignmentRecord> tmp;
std::vector<seqan::BamAlignmentRecord> tmp{};
std::sort(short_reads.begin(), short_reads.end(), bamRecordPosLessQualityGreater());

std::vector<int32_t> ends{};
Expand Down Expand Up @@ -477,15 +483,15 @@ inline seqan::Dna5String build_consensus(seqan::StringSet<seqan::Dna5String> con
std::vector<double> const & quals) // mapping qualities
{
// TODO:: include base qualities from bam file
seqan::Align<seqan::Dna5String> align;
seqan::Align<seqan::Dna5String> align{};
seqan::resize(seqan::rows(align), seqan::length(seqs));
for (unsigned i = 0; i < seqan::length(seqs); ++i)
seqan::assignSource(seqan::row(align, i), seqs[i]);

seqan::globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3));

seqan::Dna5String consensus;
seqan::String<seqan::ProfileChar<seqan::Dna5, double> > profile;
seqan::Dna5String consensus{};
seqan::String<seqan::ProfileChar<seqan::Dna5, double> > profile{};
seqan::resize(profile, 1);

// fill profile and get maximum supported character for each position
Expand Down
24 changes: 12 additions & 12 deletions include/sviper/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ struct CmdOptions
double mean_insert_size_of_short_reads{280.054}; // original coverage
double stdev_insert_size_of_short_reads{145.162};
int length_of_short_reads{150}; // original coverage
std::string long_read_file_name;
std::string short_read_file_name;
std::string candidate_file_name;
std::string output_prefix;
std::string reference_file_name;
std::string log_file_name;
std::string long_read_file_name{};
std::string short_read_file_name{};
std::string candidate_file_name{};
std::string output_prefix{};
std::string reference_file_name{};
std::string log_file_name{};

CmdOptions(std::string long_, std::string short_, std::string candidate_, std::string ref_) :
threads(std::thread::hardware_concurrency()),
Expand Down Expand Up @@ -49,11 +49,11 @@ struct SViperConfig
bool verbose{false};

// options to be set through the command line options
int flanking_region; // size of flanking region for breakpoints
int mean_coverage_of_short_reads; // original coverage
double mean_insert_size_of_short_reads;
double stdev_insert_size_of_short_reads;
int length_of_short_reads;
int flanking_region{}; // size of flanking region for breakpoints
int mean_coverage_of_short_reads{}; // original coverage
double mean_insert_size_of_short_reads{};
double stdev_insert_size_of_short_reads{};
int length_of_short_reads{};

unsigned ref_flank_length{500}; // length to flank to the consensus sequence with the reference

Expand All @@ -74,7 +74,7 @@ struct SViperConfig
double alpha{1}; // scaling factor such that mapping and base quality are comparable
double mean_coverage{0}; // will be overridden every round after mapping
double min_coverage{4};
std::vector<unsigned> cov_profile;
std::vector<unsigned> cov_profile{};

/* Polishing statistics (total base count).
* Note: The total number can exceed the length easily since bases can be
Expand Down
4 changes: 2 additions & 2 deletions include/sviper/io.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace sviper
{
bool read_vcf(std::vector<Variant> & variants, std::vector<std::string> & vcf_header, input_output_information & info)
{
std::ifstream input_vcf; // The candidate variants to polish
std::ifstream input_vcf{}; // The candidate variants to polish

if (!open_file_success(input_vcf, info.cmd_options.candidate_file_name.c_str()))
return false;
Expand All @@ -32,7 +32,7 @@ bool read_vcf(std::vector<Variant> & variants, std::vector<std::string> & vcf_he

bool write_vcf(std::vector<Variant> & variants, std::vector<std::string> & vcf_header, input_output_information & info)
{
std::ofstream output_vcf; // The polished variant as output
std::ofstream output_vcf{}; // The polished variant as output
if (!open_file_success(output_vcf, (info.cmd_options.output_prefix + ".vcf").c_str()))
return false;

Expand Down
14 changes: 7 additions & 7 deletions include/sviper/mapping.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ struct Mapping_object
typedef seqan::Gaps<TRead, seqan::ArrayGaps> TGapsRead;
typedef seqan::Gaps<TRef, seqan::ArrayGaps> TGapsRef;

TRead read;
TGapsRead gapsRead;
TGapsRef gapsRef;
TRead read{};
TGapsRead gapsRead{};
TGapsRef gapsRef{};
bool read_is_rc{false}; // rc = reverse-complemented
double mapQRead{0.0};

TRead mate;
TGapsRead gapsMate;
TGapsRef gapsRefMate;
TRead mate{};
TGapsRead gapsMate{};
TGapsRef gapsRefMate{};
bool mate_is_rc{false}; // rc = reverse-complemented
double mapQMate{0.0};

Expand Down Expand Up @@ -152,7 +152,7 @@ std::vector<Mapping_object> mapping(seqan::StringSet<seqan::Dna5QString> const &
{
SEQAN_ASSERT_EQ(seqan::length(reads1), seqan::length(reads2));

std::vector<Mapping_object> mobs;
std::vector<Mapping_object> mobs{};
//mobs.resize(length(reads1)); TODO

for (unsigned ridx = 0; ridx < seqan::length(reads1); ++ridx) // for every read (pair)
Expand Down
16 changes: 11 additions & 5 deletions include/sviper/merge_split_alignments.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,17 +94,23 @@ inline void truncate_cigar_right(seqan::BamAlignmentRecord & record, int until)
}

if ((record.cigar[length(record.cigar) - 1]).operation != 'M' &&
(record.cigar[length(record.cigar) - 1]).operation != 'X' &&
(record.cigar[length(record.cigar) - 1]).operation != '=' &&
(record.cigar[length(record.cigar) - 1]).operation != 'I') // meaning == H,S or D
{
if ((record.cigar[length(record.cigar) - 1]).operation == 'D' ||
(record.cigar[length(record.cigar) - 1]).operation == 'H')
{ // continuing deletion in record must be added to the large DEL event
seqan::erase(record.cigar, length(record.cigar) - 1);
}
else // meaning == S
else if ((record.cigar[length(record.cigar) - 1]).operation == 'S')
{
(record.cigar[length(record.cigar) - 1]).operation = 'I'; // add those bases
}
else
{
throw std::logic_error{"UNKNOWN CIGAR CHARACTER."};
}
}
}

Expand Down Expand Up @@ -246,8 +252,8 @@ inline void turn_hard_clipping_to_soft_clipping(seqan::BamAlignmentRecord & prim
// Note: common hard clipping is removed from the cigar string.
// hard clipping in one is turned to soft clipping by appending part
// of the sequence of the other
decltype(prim.seq) final_prim_seq;
decltype(supp.seq) final_supp_seq;
decltype(prim.seq) final_prim_seq{};
decltype(supp.seq) final_supp_seq{};

// beginning of alignment
if ((prim.cigar[0]).operation == 'H')
Expand Down Expand Up @@ -518,8 +524,8 @@ inline seqan::BamAlignmentRecord merge_record_group(std::vector<seqan::BamAlignm

inline std::vector<seqan::BamAlignmentRecord> merge_alignments(std::vector<seqan::BamAlignmentRecord> const & records)
{
std::vector<seqan::BamAlignmentRecord> merged_records;
std::vector<seqan::BamAlignmentRecord> record_group;
std::vector<seqan::BamAlignmentRecord> merged_records{};
std::vector<seqan::BamAlignmentRecord> record_group{};

for (auto const & rec : records)
{
Expand Down
10 changes: 5 additions & 5 deletions include/sviper/polishing.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,16 +116,16 @@ void fill_profile(seqan::String<seqan::ProfileChar<seqan::Dna5, double> > & prof
// every position. This will account for deletions and substitutions.
// Insertions are first stored seperately because otherwise the positions
// in read-ref-alignments are not "synchronized" and hard to handle.
std::vector<seqan::String<seqan::ProfileChar<seqan::Dna5, double> >> insertion_profiles;
std::vector<std::vector<unsigned>> insertion_cov_profiles;
std::vector<seqan::String<seqan::ProfileChar<seqan::Dna5, double> >> insertion_profiles{};
std::vector<std::vector<unsigned>> insertion_cov_profiles{};
insertion_profiles.resize(seqan::length(profile)); // there can be an insertion after every position
insertion_cov_profiles.resize(seqan::length(profile));

// the ins_cov_profile count the coverage of insertions for every little profile
// but additionally, one needs to count the number of reads that span the position
// and do NOT display an insertion. Those are counted with non_insertion_coverage_count
// for each read and the insertion profile is updated accordingly.
std::vector<unsigned> non_insertion_coverage_count;
std::vector<unsigned> non_insertion_coverage_count{};
non_insertion_coverage_count.resize(seqan::length(profile));
for (auto & cov_count : non_insertion_coverage_count)
cov_count = 0;
Expand Down Expand Up @@ -239,7 +239,7 @@ inline seqan::Dna5String consensus_from_profile(seqan::String<seqan::ProfileChar
{
SEQAN_ASSERT(seqan::length(profile) == seqan::length(contigGaps));

seqan::Dna5String cns;
seqan::Dna5String cns{};
unsigned begin = seqan::toViewPosition(contigGaps, config.ref_flank_length);
unsigned end = seqan::toViewPosition(contigGaps, seqan::length(seqan::source(contigGaps)) - config.ref_flank_length);

Expand Down Expand Up @@ -321,7 +321,7 @@ seqan::Dna5String polish_to_perfection(seqan::StringSet<seqan::Dna5QString> cons
seqan::Dna5String ref,
SViperConfig & config)
{
seqan::Dna5String old_ref;
seqan::Dna5String old_ref{};

// first, do only fix mismatches
while (ref != old_ref && config.rounds < 20)
Expand Down
10 changes: 5 additions & 5 deletions include/sviper/stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@ namespace sviper
{
inline std::pair<double, double> stats_insert_size(std::string const & bam_file_name, std::string contig_name)
{
seqan::BamFileIn bam_file;
seqan::BamIndex<seqan::Bai> bam_index;
seqan::BamFileIn bam_file{};
seqan::BamIndex<seqan::Bai> bam_index{};

if (!open_file_success(bam_file, bam_file_name.c_str()))
return {-1.0, -1.0};
if (!open_file_success(bam_index, (bam_file_name + ".bai").c_str()))
return {-1.0, -1.0};

seqan::BamHeader bam_header;
seqan::BamHeader bam_header{};
seqan::readHeader(bam_header, bam_file);

std::vector<double> insert_sizes;
seqan::BamAlignmentRecord record;
std::vector<double> insert_sizes{};
seqan::BamAlignmentRecord record{};
bool hasAlignments = false;

int rID = 0;
Expand Down
29 changes: 17 additions & 12 deletions include/sviper/sviper.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ seqan::ArgumentParser::ParseResult parseCommandLine(CmdOptions & options, int ar

bool polish_variant(Variant & var, input_output_information & info)
{
std::stringstream localLog;
std::stringstream localLog{};
seqan::BamFileIn & short_read_bam = *(info.short_read_file_handles[omp_get_thread_num()]);
seqan::BamFileIn & long_read_bam = *(info.long_read_file_handles[omp_get_thread_num()]);
seqan::FaiIndex & faiIndex = *(info.faidx_file_handles[omp_get_thread_num()]);
Expand Down Expand Up @@ -187,8 +187,8 @@ bool polish_variant(Variant & var, input_output_information & info)
SEQAN_ASSERT_LEQ(ref_region_start, ref_region_end);

// get reference id in bam File
unsigned rID_short;
unsigned rID_long;
unsigned rID_short{};
unsigned rID_long{};

if (!seqan::getIdByName(rID_short, seqan::contigNamesCache(seqan::context(short_read_bam)), var.ref_chrom))
{
Expand All @@ -212,10 +212,10 @@ bool polish_variant(Variant & var, input_output_information & info)

// Extract long reads
// ---------------------------------------------------------------------
std::vector<seqan::BamAlignmentRecord> supporting_records;
std::vector<seqan::BamAlignmentRecord> supporting_records{};

{
std::vector<seqan::BamAlignmentRecord> long_reads;
std::vector<seqan::BamAlignmentRecord> long_reads{};

// extract overlapping the start breakpoint +-50 bp's
seqan::viewRecords(long_reads, long_read_bam, info.long_read_bai, rID_long, var_ref_pos_sub50, var_ref_pos_add50);
Expand Down Expand Up @@ -295,7 +295,7 @@ bool polish_variant(Variant & var, input_output_information & info)
// ---------------------------------------------------------------------
localLog << "--- Cropping long reads with a buffer of +-" << info.cmd_options.flanking_region << " around variants." << std::endl;

seqan::StringSet<seqan::Dna5String> supporting_sequences;
seqan::StringSet<seqan::Dna5String> supporting_sequences{};
std::vector<seqan::BamAlignmentRecord>::size_type maximum_long_reads = 5;

// sort records such that the highest quality ones are chosen first
Expand All @@ -304,6 +304,11 @@ bool polish_variant(Variant & var, input_output_information & info)
for (unsigned i = 0; i < std::min(maximum_long_reads, supporting_records.size()); ++i)
{
auto region = get_read_region_boundaries(supporting_records[i], ref_region_start, ref_region_end);

assert(std::get<0>(region) >= 0);
assert(std::get<1>(region) >= 0);
assert(std::get<0>(region) <= std::get<1>(region));

seqan::Dna5String reg = seqan::infix(supporting_records[i].seq, std::get<0>(region), std::get<1>(region));

// For deletions, the expected size of the subsequence is that of
Expand Down Expand Up @@ -343,7 +348,7 @@ bool polish_variant(Variant & var, input_output_information & info)

// Build consensus of supporting read regions
// ---------------------------------------------------------------------
std::vector<double> mapping_qualities;
std::vector<double> mapping_qualities{};
mapping_qualities.resize(supporting_records.size());
for (unsigned i = 0; i < supporting_records.size(); ++i)
mapping_qualities[i] = (supporting_records[i]).mapQ;
Expand All @@ -352,18 +357,18 @@ bool polish_variant(Variant & var, input_output_information & info)

localLog << "--- Built a consensus with a MSA of length " << seqan::length(cns) << "." << std::endl;

seqan::Dna5String polished_ref;
seqan::Dna5String polished_ref{};
SViperConfig config{info.cmd_options};
config.ref_flank_length = 500;

{
seqan::StringSet<seqan::Dna5QString> short_reads_1; // reads (first in pair)
seqan::StringSet<seqan::Dna5QString> short_reads_2; // mates (second in pair)
seqan::StringSet<seqan::Dna5QString> short_reads_1{}; // reads (first in pair)
seqan::StringSet<seqan::Dna5QString> short_reads_2{}; // mates (second in pair)

{
// Extract short reads in region
// ---------------------------------------------------------------------
std::vector<seqan::BamAlignmentRecord> short_reads;
std::vector<seqan::BamAlignmentRecord> short_reads{};
// If the breakpoints are farther apart then illumina-read-length + 2 * flanking-region,
// then extract reads for each break point separately.
if (ref_region_end - ref_region_start > info.cmd_options.flanking_region * 2 + info.cmd_options.length_of_short_reads)
Expand Down Expand Up @@ -438,7 +443,7 @@ bool polish_variant(Variant & var, input_output_information & info)

// Align polished sequence to reference
// ---------------------------------------------------------------------
seqan::Dna5String ref_part;
seqan::Dna5String ref_part{};
seqan::readRegion(ref_part, faiIndex, ref_fai_idx,
std::max(1u, ref_region_start - config.ref_flank_length),
std::min(ref_region_end + config.ref_flank_length, static_cast<unsigned>(ref_length)));
Expand Down
Loading

0 comments on commit 72d1e0c

Please sign in to comment.