From 1425e5d6866e08b5e46898a2a828cdbd0cdc2a75 Mon Sep 17 00:00:00 2001 From: asylvz Date: Thu, 10 Sep 2020 14:10:39 +0300 Subject: [PATCH] Fix for MEI --- variants.c | 39 +++++++++++++++++++++---------------- variants.h | 1 + vh/vh_setcover.c | 50 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/variants.c b/variants.c index a608209..c9b3799 100644 --- a/variants.c +++ b/variants.c @@ -139,14 +139,14 @@ char* reverseComplement( char* str) void print_sv_stats() { - fprintf(logFile,"\n\nTARDIS found %d SVs total\n", del_cnt + ins_cnt + inv_cnt + tandup_cnt + mei_cnt + interdup_cnt + invdup_cnt); + fprintf(logFile,"\n\nTARDIS found %d SVs total\n", del_cnt + ins_cnt + inv_cnt + tandup_cnt + mei_cnt - mei_cnt_filtered + interdup_cnt + invdup_cnt); fprintf(logFile,"\tDeletion: %d\n", del_cnt); fprintf(logFile,"\tInsertion: %d\n", ins_cnt); fprintf(logFile,"\tInversion: %d\n", inv_cnt); fprintf(logFile,"\tTandem Duplication: %d\n", tandup_cnt); fprintf(logFile,"\tInterspersed Duplication: %d\n", interdup_cnt); fprintf(logFile,"\tInterspersed (Inverted) Duplication: %d\n", invdup_cnt); - fprintf(logFile,"\tMEI: %d (%d filtered)\n", mei_cnt, mei_cnt_filtered); + fprintf(logFile,"\tMEI: %d (%d filtered)\n", mei_cnt - mei_cnt_filtered, mei_cnt_filtered); fprintf(logFile,"\tNUMT: %d\n", numt_cnt); fprintf(stderr,"\n\nTARDIS is complete. Found %d SVs total\n", del_cnt + ins_cnt + inv_cnt + tandup_cnt + mei_cnt + interdup_cnt + invdup_cnt); @@ -156,7 +156,7 @@ void print_sv_stats() fprintf(stderr,"\tTandem Duplication: %d\n", tandup_cnt); fprintf(stderr,"\tInterspersed Duplication: %d\n", interdup_cnt); fprintf(stderr,"\tInterspersed (Inverted) Duplication: %d\n", invdup_cnt); - fprintf(stderr,"\tMEI: %d (%d filtered)\n", mei_cnt, mei_cnt_filtered); + fprintf(stderr,"\tMEI: %d (%d filtered)\n", mei_cnt - mei_cnt_filtered, mei_cnt_filtered); fprintf(stderr,"\tNUMT: %d\n", numt_cnt); } @@ -250,6 +250,9 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI if( seq != NULL) free( seq); + if( sv->filtered != false) + mei_cnt_filtered++; + sv_len = abs( ( sv->inner_end - sv->inner_start + 1)); total_inv_length += sv_len; } @@ -258,7 +261,9 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI seq = readRefAltSeqMEI( params, sv->chr_name, sv->mei_name); if( seq != NULL) - sv_len = strlen(seq); + sv_len = strlen(seq) + abs(sv->inner_start - sv->outer_start); + else + sv_len = abs(sv->inner_end - sv->inner_start) + abs(sv->inner_start - sv->outer_start); if( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) { @@ -266,9 +271,6 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI fprintf( fpOut, "%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1, "vh_mei_", ++mei_cnt, seq[0], seq, 255, ( sv->filtered == false) ? "PASS" : "mfilt"); else fprintf( fpOut, "%s\t%i\t%s%d\t%s\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1, "vh_mei_", ++mei_cnt, ".", ".", 255, ( sv->filtered == false) ? "PASS" : "mfilt"); - - if( sv->filtered) - mei_cnt_filtered++; } else if( seq != NULL) fprintf( fpOut, "%s\t%i\t%s%d\t%c\t%s%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1, "vh_mei_", ++mei_cnt, seq[0], "<", "INS:ME:", sv->mei_name, ">", 255, ( sv->filtered == false) ? "PASS" : "mfilt"); @@ -278,26 +280,30 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI if( seq != NULL) free( seq); - //sv_len = abs( ( sv->outer_end - sv->inner_end + 1)); + if( sv->filtered != false) + mei_cnt_filtered++; + total_mei_length += sv_len; } else if( sv->svtype == MEIREVERSE) { seq = readRefAltSeqMEI( params, sv->chr_name, sv->mei_name); if(seq != NULL) - sv_len = strlen(seq); + sv_len = strlen(seq) + abs(sv->inner_start - sv->outer_start); + else + sv_len = abs(sv->inner_end - sv->inner_start) + abs(sv->inner_start - sv->outer_start); if( params->seq_resolved != 0 || sv->imprecise == NOT_IMPRECISE) { if( seq != NULL) { seq_rev = reverseComplement( seq); - fprintf( fpOut, "%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_end) + 1, "vh_mei_", ++mei_cnt, seq[0], seq_rev, 255, ( sv->filtered == false) ? "PASS" : "mfilt"); + fprintf( fpOut, "%s\t%i\t%s%d\t%c\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1, "vh_mei_", ++mei_cnt, seq[0], seq_rev, 255, ( sv->filtered == false) ? "PASS" : "mfilt"); } else { seq_rev = NULL; - fprintf( fpOut, "%s\t%i\t%s%d\t%s\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_end) + 1, "vh_mei_", ++mei_cnt, ".", ".", 255, ( sv->filtered == false) ? "PASS" : "mfilt"); + fprintf( fpOut, "%s\t%i\t%s%d\t%s\t%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1, "vh_mei_", ++mei_cnt, ".", ".", 255, ( sv->filtered == false) ? "PASS" : "mfilt"); } if( sv->filtered) mei_cnt_filtered++; @@ -306,14 +312,13 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI free( seq_rev); } else if( seq != NULL) - fprintf( fpOut, "%s\t%i\t%s%d\t%c\t%s%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_end) + 1, "vh_mei_", ++mei_cnt, seq[0], "<", "INS:ME:", sv->mei_name, ">", 255, ( sv->filtered == false) ? "PASS" : "mfilt"); + fprintf( fpOut, "%s\t%i\t%s%d\t%c\t%s%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1, "vh_mei_", ++mei_cnt, seq[0], "<", "INS:ME:", sv->mei_name, ">", 255, ( sv->filtered == false) ? "PASS" : "mfilt"); else - fprintf( fpOut, "%s\t%i\t%s%d\t%s\t%s%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_end) + 1, "vh_mei_", ++mei_cnt, ".", "<", "INS:ME:", sv->mei_name, ">", 255, ( sv->filtered == false) ? "PASS" : "mfilt"); + fprintf( fpOut, "%s\t%i\t%s%d\t%s\t%s%s%s%s\t%d\t%s\t", sv->chr_name, ( sv->outer_start) + 1, "vh_mei_", ++mei_cnt, ".", "<", "INS:ME:", sv->mei_name, ">", 255, ( sv->filtered == false) ? "PASS" : "mfilt"); if( seq != NULL) free( seq); - //sv_len = abs( ( sv->outer_end - sv->inner_end + 1)); total_mei_length += sv_len; } else if( sv->svtype == NUMTFORWARD || sv->svtype == NUMTREVERSE) @@ -445,8 +450,8 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI if( sv->svtype == MEIFORWARD) fprintf( fpOut, "END=%d;SVLEN=%d;MEINFO=%s;RPSUP=%d;SRSUP=%d;", (sv->outer_start) + sv_len + 1, sv_len, sv->mei_name, rp_total, sr_total); - if( sv->svtype == MEIREVERSE) - fprintf( fpOut, "END=%d;SVLEN=%d;MEINFO=%s;RPSUP=%d;SRSUP=%d;", (sv->outer_end) + sv_len + 1, sv_len, sv->mei_name, rp_total, sr_total); + else if( sv->svtype == MEIREVERSE) + fprintf( fpOut, "END=%d;SVLEN=%d;MEINFO=%s;RPSUP=%d;SRSUP=%d;", (sv->outer_start) + sv_len + 1, sv_len, sv->mei_name, rp_total, sr_total); else if( sv->svtype == INVERSION) fprintf( fpOut, "END=%d;SVLEN=%d;RPSUP=%d;SRSUP=%d;", (sv->outer_end) + 1, sv_len, rp_total, sr_total); else if( sv->svtype == INVDUPLEFT || sv->svtype == INTERDUPLEFT) @@ -478,7 +483,7 @@ void print_strvar( bam_info** in_bams, parameters* params, struct strvar* sv, FI fprintf( fpOut, "SVTYPE=DEL:ME:%s\t", sv->mei_type); else if( sv->svtype == DELETION) fprintf( fpOut, "SVTYPE=DEL\t"); - else if( sv->svtype == MEIFORWARD || sv->svtype == MEIREVERSE) + else if(sv->svtype == MEIREVERSE || sv->svtype == MEIFORWARD) fprintf( fpOut, "SVTYPE=INS:ME:%s\t", sv->mei_type); else if( sv->svtype == NUMTFORWARD || sv->svtype == NUMTREVERSE) fprintf( fpOut, "SVTYPE=INS:MT\t"); diff --git a/variants.h b/variants.h index 073367d..97cd97d 100644 --- a/variants.h +++ b/variants.h @@ -60,4 +60,5 @@ void free_chr(struct strvar ** variations, char* chr_); void print_vcf_header( FILE *fpOut, bam_info** in_bams, parameters *params); int print_chr(struct strvar ** variations, parameters *params, char* chr_, FILE *fpOut); void print_sv_stats(); +char* readRefAltSeqMEI( parameters *params, char* chr_name, char *mei_string); #endif diff --git a/vh/vh_setcover.c b/vh/vh_setcover.c index ad3d2d0..1d0ada0 100644 --- a/vh/vh_setcover.c +++ b/vh/vh_setcover.c @@ -716,9 +716,8 @@ void outputCluster( bam_info** in_bams, parameters* params, int cluster_id, FILE { int i; char mobileName[strSize]; - - float is_start_satellite, is_end_satellite, is_mid_satellite; - sonic_repeat *mei_start, *mei_end, *mei_mid; + float is_start_satellite, is_end_satellite, is_mid_satellite, is_all_satellite; + sonic_repeat *mei_start, *mei_end, *mei_mid, *mei_all; bool MEI_Filter = false; struct strvar* var_example = NULL; @@ -731,19 +730,32 @@ void outputCluster( bam_info** in_bams, parameters* params, int cluster_id, FILE in_bams[i]->contribution = true; } - is_start_satellite = sonic_is_satellite(params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posStartSV_Outer, listClusterEl[cluster_id].posStartSV_Outer + 1); - is_end_satellite = sonic_is_satellite(params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posEndSV_Outer - 1, listClusterEl[cluster_id].posEndSV_Outer); - is_mid_satellite = sonic_is_satellite(params->this_sonic, listClusterEl[cluster_id].chromosome_name, (int)( listClusterEl[cluster_id].posStartSV_Outer + listClusterEl[cluster_id].posEndSV_Outer) / 2, - (int)( listClusterEl[cluster_id].posStartSV_Outer + listClusterEl[cluster_id].posEndSV_Outer) / 2 + 1); + if( listClusterEl[cluster_id].SVtype == MEIFORWARD || listClusterEl[cluster_id].SVtype == MEIREVERSE) + { + + is_all_satellite = sonic_is_satellite(params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posStartSV_Outer, listClusterEl[cluster_id].posEndSV_Outer); + is_start_satellite = sonic_is_satellite(params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posStartSV_Outer, listClusterEl[cluster_id].posStartSV_Outer + 1); + is_end_satellite = sonic_is_satellite(params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posEndSV_Outer - 1, listClusterEl[cluster_id].posEndSV_Outer); + is_mid_satellite = sonic_is_satellite(params->this_sonic, listClusterEl[cluster_id].chromosome_name, (int)( listClusterEl[cluster_id].posStartSV_Outer + listClusterEl[cluster_id].posEndSV_Outer) / 2, + (int)( listClusterEl[cluster_id].posStartSV_Outer + listClusterEl[cluster_id].posEndSV_Outer) / 2 + 1); - mei_start = sonic_is_mobile_element( params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posStartSV_Outer, listClusterEl[cluster_id].posStartSV_Outer + 1, params->mei); - mei_end = sonic_is_mobile_element( params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posEndSV_Outer - 1, listClusterEl[cluster_id].posEndSV_Outer, params->mei); - mei_mid = sonic_is_mobile_element( params->this_sonic, listClusterEl[cluster_id].chromosome_name, (int)( listClusterEl[cluster_id].posStartSV_Outer + listClusterEl[cluster_id].posEndSV_Outer) / 2, - (int)( listClusterEl[cluster_id].posStartSV_Outer + listClusterEl[cluster_id].posEndSV_Outer) / 2 + 1, params->mei); + mei_start = sonic_is_mobile_element( params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posStartSV_Outer, listClusterEl[cluster_id].posStartSV_Outer + 1, params->mei); + mei_end = sonic_is_mobile_element( params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posEndSV_Outer - 1, listClusterEl[cluster_id].posEndSV_Outer, params->mei); + mei_mid = sonic_is_mobile_element( params->this_sonic, listClusterEl[cluster_id].chromosome_name, (int)( listClusterEl[cluster_id].posStartSV_Outer + listClusterEl[cluster_id].posEndSV_Outer) / 2, + (int)( listClusterEl[cluster_id].posStartSV_Outer + listClusterEl[cluster_id].posEndSV_Outer) / 2 + 1, params->mei); + mei_all = sonic_is_mobile_element( params->this_sonic, listClusterEl[cluster_id].chromosome_name, listClusterEl[cluster_id].posStartSV_Outer - 1, listClusterEl[cluster_id].posEndSV_Outer, params->mei); + } if( listClusterEl[cluster_id].SVtype == MEIFORWARD) { + strcpy( mobileName, listClusterEl[cluster_id].mobileName); + //if(mei_all != NULL) + //fprintf(stderr, "FORWARD: %s, %s - %s, %s, %d\n", mobileName, listClusterEl[cluster_id].mei_type, mei_all->repeat_type, mei_all->repeat_class, mei_all->strand); + + if( ( mei_all != NULL && strcmp( mei_all->repeat_class, listClusterEl[cluster_id].mei_type) == 0 && + mei_all->strand == SONIC_STRAND_FWD) || ( is_all_satellite != 0)) + MEI_Filter = true; if( ( mei_start != NULL && strcmp( mei_start->repeat_type, mobileName) == 0 && mei_start->strand == SONIC_STRAND_FWD) || ( is_start_satellite != 0)) @@ -760,7 +772,8 @@ void outputCluster( bam_info** in_bams, parameters* params, int cluster_id, FILE else if( listClusterEl[cluster_id].SVtype == MEIREVERSE) { strcpy( mobileName, listClusterEl[cluster_id].mobileName); - + //if(mei_all != NULL) + //fprintf(stderr, "REVERSE: %s, %s - %s, %s, %d\n", mobileName, listClusterEl[cluster_id].mei_type, mei_all->repeat_type, mei_all->repeat_class, mei_all->strand); if( ( mei_start != NULL && strcmp( mei_start->repeat_type, mobileName) == 0 && mei_start->strand == SONIC_STRAND_REV) || ( is_start_satellite != 0)) MEI_Filter = true; @@ -772,6 +785,10 @@ void outputCluster( bam_info** in_bams, parameters* params, int cluster_id, FILE if( ( mei_mid != NULL && strcmp( mei_mid->repeat_type, mobileName) == 0 && mei_mid->strand == SONIC_STRAND_REV) || ( is_mid_satellite != 0)) MEI_Filter = true; + + if( ( mei_all != NULL && strcmp( mei_all->repeat_class, listClusterEl[cluster_id].mei_type) == 0 && + mei_all->strand == SONIC_STRAND_REV) || ( is_all_satellite != 0)) + MEI_Filter = true; } if( listClusterEl[cluster_id].SVtype == MEIFORWARD || listClusterEl[cluster_id].SVtype == NUMTFORWARD) @@ -1299,6 +1316,15 @@ void processTheSV( bam_info **in_bams, parameters *params, int listClusterId, in tmp_cluster = tmp_cluster->next; } } + /*if(listClusterEl[listClusterId].SVtype == MEIREVERSE) + { + fprintf(stderr,"REVERSE: %d - %d - %d - %d\n", posStartSV_Outer, posStartSV, posEndSV, posEndSV_Outer); + } + else if(listClusterEl[listClusterId].SVtype == MEIFORWARD) + { + fprintf(stderr,"FORWARD: %d - %d - %d - %d\n", posStartSV_Outer, posStartSV, posEndSV, posEndSV_Outer); + }*/ + listClusterEl[listClusterId].posStartSV = posStartSV; listClusterEl[listClusterId].posStartSV_Outer = posStartSV_Outer; listClusterEl[listClusterId].posEndSV = posEndSV;