diff --git a/processbam.c b/processbam.c index 4fd2e0e..f251eaa 100644 --- a/processbam.c +++ b/processbam.c @@ -28,8 +28,9 @@ void load_bam( parameters *params, configuration *cfg, bam_info* in_bam, char* p int* fragment_size_total; float* variance; char* library_name = NULL; - int i, j, library_index, diff, return_value; + int i, j, k, library_index, diff, return_value, temp; char fai_file[MAX_SEQ]; + double count; fprintf( stderr, "Processing BAM file %s.\n", path); @@ -100,10 +101,10 @@ void load_bam( parameters *params, configuration *cfg, bam_info* in_bam, char* p /* For SAMPLEFRAG number of alignments, store the template length field. Performed for each different library */ fragment_size = ( int**) getMem( in_bam->num_libraries * sizeof( int*)); - + for( i = 0; i < in_bam->num_libraries; i++) { - fragment_size[i] = NULL; + fragment_size[i] = NULL; fragment_size[i] = ( int*) getMem( SAMPLEFRAG * sizeof( int)); } @@ -163,8 +164,23 @@ void load_bam( parameters *params, configuration *cfg, bam_info* in_bam, char* p /* Sort the fragment sizes */ qsort( fragment_size[i], SAMPLEFRAG, sizeof( int), compare_size_int); + /* When the library does not have sufficient number of reads, we discard the leading 0s when calculating the median */ + count = 0; + if( fragments_sampled[i] < SAMPLEFRAG) + { + for( k = 0; k < SAMPLEFRAG; k++) + { + if( fragment_size[i][k] == 0) + count++; + else + break; + } + } /* Get the medians */ - ( in_bam->libraries)[i]->frag_med = fragment_size[i][( SAMPLEFRAG / 2) - 1]; + temp = ( fragments_sampled[i] / 2) - 1 + (int) count; + ( in_bam->libraries)[i]->frag_med = fragment_size[i][temp]; + + //fprintf(stderr,"There are %d 0s, %d fragments, index %d and median is %d\n", (int) count, fragments_sampled[i], temp, ( in_bam->libraries)[i]->frag_med); } /* Find the fragment sizes which pass the second test, and will contribute to the avg and std */ @@ -195,7 +211,6 @@ void load_bam( parameters *params, configuration *cfg, bam_info* in_bam, char* p second_test_pass[i] = second_test_pass[i] + 1; } } - } for( i = 0; i < in_bam->num_libraries; i++) @@ -404,6 +419,15 @@ void create_fastq( bam_info* in_bam, char* bam_path, parameters* params) } } +void is_library_valid( struct library_properties* in_lib) +{ + if( in_lib->frag_avg >=0 && in_lib->frag_std >=0 && in_lib->read_length >= 0 && in_lib->conc_min>=0 && in_lib->conc_max >=0) + in_lib->is_valid = true; + else + in_lib->is_valid = false; +} + + void set_library_min_max( struct library_properties* in_lib) { in_lib->conc_min = in_lib->frag_avg - ( 4 * in_lib->frag_std); diff --git a/processfq.h b/processfq.h index 00b99ad..e1a7f9c 100644 --- a/processfq.h +++ b/processfq.h @@ -36,6 +36,7 @@ typedef struct library_properties char *divet; /* file name for the DIVET file post mrfast mapping */ int read_length; /* length of reads for this library */ int num_sequences; /* number of paired-end sequences for this library */ + bool is_valid; /* Checks if the library is valid, i.e., mean, conc_min, conc_max, etc. */ struct softClip *listSoftClip; struct discordantMapping **mappings_discordant; diff --git a/variants.c b/variants.c index 3677bb1..b55424d 100644 --- a/variants.c +++ b/variants.c @@ -183,7 +183,7 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI /* Find ref and alt sequences */ seq = readRefAltSeq( params, sv->chr_name, sv->inner_start, sv->inner_end); - if( params->seq_resolved != 0 && seq != NULL) + if( ( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) && seq != NULL) { fprintf( fpOut, "%s\t%i\t%s%d\t%s\t%c\t255\t%s\t", sv->chr_name, ( sv->inner_start) + 1, "vh_del_", ++del_cnt, seq, seq[0], ( sv->filtered == false) ? "PASS" : "LowQual"); } @@ -232,7 +232,7 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI { seq = readRefAltSeq( params, sv->chr_name, sv->outer_start, sv->outer_end); - if( params->seq_resolved != 0 && seq != NULL) + if( ( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) && seq != NULL) { /* Find ref and alt sequences */ seq_rev = reverseComplement( seq); @@ -256,11 +256,11 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI else if( sv->svtype == MEIFORWARD) { seq = readRefAltSeqMEI( params, sv->chr_name, sv->mei_name); - if(seq != NULL) - sv_len = strlen(seq); + if( seq != NULL) + sv_len = strlen(seq); - if( params->seq_resolved != 0) + if( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) { if( seq != NULL) fprintf( fpOut, "%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1, "vh_mei_", ++mei_cnt, seq[0], seq, 255, ( sv->filtered == false) ? "PASS" : "mfilt"); @@ -287,7 +287,7 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI if(seq != NULL) sv_len = strlen(seq); - if( params->seq_resolved != 0) + if( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) { if( seq != NULL) { @@ -332,7 +332,7 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI { seq = readRefAltSeq( params, sv->chr_name, sv->outer_start, sv->outer_end); - if( params->seq_resolved != 0 && seq != NULL) + if( ( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) && seq != NULL) { fprintf( fpOut, "%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, seq[0], seq, 255, ( sv->filtered == false) ? "PASS" : "LowQual"); } @@ -350,52 +350,92 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI } else if( sv->svtype == INVDUPRIGHT) { - if( params->seq_resolved != 0) + if( sv->inner_start > sv->outer_start) + seq = readRefAltSeq( params, sv->chr_name, sv->outer_start, sv->inner_start); + else + seq = readRefAltSeq( params, sv->chr_name, sv->inner_start, sv->outer_start); + + if( ( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) && seq != NULL) { - fprintf( fpOut,"%s\t%i\t%s%d\t%s\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, ".", ".", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); + fprintf( fpOut,"%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, seq[0], seq, 255, ( sv->filtered == false) ? "PASS" : "LowQual"); } + else if( seq != NULL) + fprintf( fpOut,"%s\t%i\t%s%d\t%c\t%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, seq[0], "<", "DUP:ISP", ">", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); else fprintf( fpOut,"%s\t%i\t%s%d\t%s\t%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, ".", "<", "DUP:ISP", ">", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); + if( seq != NULL) + free( seq); + invdup_cnt++; sv_len = abs( ( sv->inner_start - sv->outer_start + 1)); total_invdup_length += sv_len; } else if( sv->svtype == INVDUPLEFT ) { - if( params->seq_resolved != 0) + if( sv->outer_end > sv->inner_end) + seq = readRefAltSeq( params, sv->chr_name, sv->inner_end, sv->outer_end); + else + seq = readRefAltSeq( params, sv->chr_name, sv->outer_end, sv->inner_end); + + if( ( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) && seq != NULL) { - fprintf( fpOut,"%s\t%i\t%s%d\t%s\t%s\t%d\t%s\t", sv->chr_name, ( sv->inner_end) + 1," vh_dup_", ++dup_cnt, ".", ".", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); + fprintf( fpOut,"%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->inner_end) + 1," vh_dup_", ++dup_cnt, seq[0], seq, 255, ( sv->filtered == false) ? "PASS" : "LowQual"); } + else if( seq != NULL) + fprintf( fpOut,"%s\t%i\t%s%d\t%c\t%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->inner_end) + 1," vh_dup_", ++dup_cnt, seq[0], "<", "DUP:ISP", ">", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); else fprintf( fpOut,"%s\t%i\t%s%d\t%s\t%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->inner_end) + 1," vh_dup_", ++dup_cnt, ".", "<", "DUP:ISP", ">", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); + if( seq != NULL) + free( seq); + invdup_cnt++; sv_len = abs( ( sv->inner_end - sv->outer_end + 1)); total_invdup_length += sv_len; } else if( sv->svtype == INTERDUPRIGHT) { - if( params->seq_resolved != 0) + if( sv->inner_start > sv->outer_start) + seq = readRefAltSeq( params, sv->chr_name, sv->outer_start, sv->inner_start); + else + seq = readRefAltSeq( params, sv->chr_name, sv->inner_start, sv->outer_start); + + if( ( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) && seq != NULL) { - fprintf( fpOut,"%s\t%i\t%s%d\t%s\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, ".", ".", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); + fprintf( fpOut,"%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, seq[0], seq, 255, ( sv->filtered == false) ? "PASS" : "LowQual"); } + else if( seq!= NULL) + fprintf( fpOut,"%s\t%i\t%s%d\t%c\t%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, seq[0], "<", "DUP:ISP", ">", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); else fprintf( fpOut,"%s\t%i\t%s%d\t%s\t%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1," vh_dup_", ++dup_cnt, ".", "<", "DUP:ISP", ">", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); + if( seq != NULL) + free( seq); + interdup_cnt++; sv_len = abs( ( sv->inner_start - sv->outer_start + 1)); total_interdup_length += sv_len; } else if( sv->svtype == INTERDUPLEFT ) { - if( params->seq_resolved != 0) + if( sv->outer_end > sv->inner_end) + seq = readRefAltSeq( params, sv->chr_name, sv->inner_end, sv->outer_end); + else + seq = readRefAltSeq( params, sv->chr_name, sv->outer_end, sv->inner_end); + + if( ( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) && seq != NULL) { - fprintf( fpOut,"%s\t%i\t%s%d\t%s\t%s\t%d\t%s\t", sv->chr_name, ( sv->inner_end) + 1," vh_dup_", ++dup_cnt, ".", ".", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); + fprintf( fpOut,"%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->inner_end) + 1," vh_dup_", ++dup_cnt, seq[0], seq, 255, ( sv->filtered == false) ? "PASS" : "LowQual"); } + else if( seq != NULL) + fprintf( fpOut,"%s\t%i\t%s%d\t%c\t%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->inner_end) + 1," vh_dup_", ++dup_cnt, seq[0], "<", "DUP:ISP", ">", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); else fprintf( fpOut,"%s\t%i\t%s%d\t%s\t%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->inner_end) + 1," vh_dup_", ++dup_cnt, ".", "<", "DUP:ISP", ">", 255, ( sv->filtered == false) ? "PASS" : "LowQual"); + if( seq != NULL) + free( seq); + interdup_cnt++; sv_len = abs( ( sv->inner_end - sv->outer_end + 1)); total_interdup_length += sv_len; diff --git a/vh/vh_divethandler.c b/vh/vh_divethandler.c index a26763e..1540a92 100644 --- a/vh/vh_divethandler.c +++ b/vh/vh_divethandler.c @@ -371,9 +371,9 @@ DivetRow *vh_loadDivetRowFromBamAlternative( alternativeMapping *discordantReadP //fprintf( divet_file, "%s\t%s\t%d\t%d\t%c\t=\t%d\t%d\t%c\t%c\t%d\t%d\t%d\n", discordantReadPtr->readName, discordantReadPtr->chromosome_name, - // discordantReadPtr->pos1, discordantReadPtr->pos1_End, orientLeft, discordantReadPtr->pos2, - //discordantReadPtr->pos2_End, orientRight, SV, discordantReadPtr->editDistance, - //discordantReadPtr->mQual1, discordantReadPtr->mQual2); + // discordantReadPtr->pos1, discordantReadPtr->pos1_End, orientLeft, discordantReadPtr->pos2, + //discordantReadPtr->pos2_End, orientRight, SV, discordantReadPtr->editDistance, + //discordantReadPtr->mQual1, discordantReadPtr->mQual2); DivetRow *newRow = createDivetRowNew (libInfo->hash, discordantReadPtr->ten_x_barcode, discordantReadPtr->readName, discordantReadPtr->chromosome_name, discordantReadPtr->pos1, @@ -515,8 +515,8 @@ int read_Divet_bam( discordantMapping **mapping, parameters *params, LibraryInfo while( discordantReadPtr != NULL) { is_satellite = sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name , discordantReadPtr->pos1, discordantReadPtr->pos1_End) - + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name, discordantReadPtr->pos2, discordantReadPtr->pos2_End) - + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name , discordantReadPtr->pos2, discordantReadPtr->pos2_End); + + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name, discordantReadPtr->pos2, discordantReadPtr->pos2_End) + + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name , discordantReadPtr->pos2, discordantReadPtr->pos2_End); if ( is_satellite == 0 && discordantReadPtr->mQual1 > params->mq_threshold && discordantReadPtr->mQual2 > params->mq_threshold && !sonic_is_gap( params->this_sonic, params->this_sonic->chromosome_names[chr_index], discordantReadPtr->pos1, discordantReadPtr->pos2) @@ -626,9 +626,9 @@ int read_Divet_bam_alternative( alternativeMapping **mapping, parameters *params /* Since MT is circular, we need to eliminate the read-pairs at both ends of the chromosome */ insLen = abs( discordantReadPtr->pos1 - discordantReadPtr->pos2); is_satellite = sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name , discordantReadPtr->pos1, discordantReadPtr->pos1_End) - + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name, discordantReadPtr->pos2, discordantReadPtr->pos2_End) - + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name , discordantReadPtr->pos2, discordantReadPtr->pos2_End) - + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name, discordantReadPtr->pos2, discordantReadPtr->pos2_End); + + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name, discordantReadPtr->pos2, discordantReadPtr->pos2_End) + + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name , discordantReadPtr->pos2, discordantReadPtr->pos2_End) + + sonic_is_satellite( params->this_sonic, discordantReadPtr->chromosome_name, discordantReadPtr->pos2, discordantReadPtr->pos2_End); if ( is_satellite == 0 && ( discordantReadPtr->mQual1 > params->mq_threshold) && ( !sonic_is_gap( params->this_sonic, params->this_sonic->chromosome_names[chr_index], discordantReadPtr->pos1, discordantReadPtr->pos2)) && @@ -826,13 +826,13 @@ int load_Divet_bam( bam_info** in_bams, parameters *params, int chr_index) g_maxListBrkPointIntr = MAXLISTBRKPOINTINTR; g_libInfo = NULL; -/* + /* divetfile_path = (char *) getMem(sizeof(char) * (2+strlen("divv.vh")+strlen(params->outprefix))); sprintf(divetfile_path, "%s-%s", params->outprefix, "divv.vh"); divet_file = safe_fopen (divetfile_path, "w"); free(divetfile_path); - */ + */ del_cnt_div = 0, ins_cnt_div = 0, inv_cnt_div = 0, tandup_cnt_div = 0, sr_cnt_div = 0, alt_cnt_div = 0, invdup_cnt_div = 0, interdup_cnt_div = 0; @@ -840,6 +840,9 @@ int load_Divet_bam( bam_info** in_bams, parameters *params, int chr_index) { for( lib_cnt = 0; lib_cnt < in_bams[bam_index]->num_libraries; lib_cnt++) { + if( in_bams[bam_index]->libraries[lib_cnt]->is_valid == false) + continue; + newLibInfo = ( struct LibraryInfo *) getMem( sizeof( struct LibraryInfo)); strcpy( newLibInfo->libName, in_bams[bam_index]->libraries[lib_cnt]->libname); strcpy( newLibInfo->indName, in_bams[bam_index]->sample_name); @@ -882,6 +885,9 @@ int load_Divet_bam( bam_info** in_bams, parameters *params, int chr_index) { for( lib_cnt = 0; lib_cnt < in_bams[bam_index]->num_libraries; lib_cnt++) { + if( in_bams[bam_index]->libraries[lib_cnt]->is_valid == false) + continue; + newLibInfo = ( struct LibraryInfo *) getMem( sizeof( struct LibraryInfo)); strcpy( newLibInfo->libName, "Alternative\0"); strcpy( newLibInfo->indName, in_bams[bam_index]->sample_name); @@ -926,6 +932,9 @@ int load_Divet_bam( bam_info** in_bams, parameters *params, int chr_index) { for( lib_cnt = 0; lib_cnt < in_bams[bam_index]->num_libraries; lib_cnt++) { + if( in_bams[bam_index]->libraries[lib_cnt]->is_valid == false) + continue; + newLibInfo = ( struct LibraryInfo *) getMem( sizeof( struct LibraryInfo)); strcpy( newLibInfo->libName, "SplitRead\0"); strcpy( newLibInfo->indName, in_bams[bam_index]->sample_name);