Skip to content

Commit

Permalink
Merge pull request #235 from susannasiebert/fusions
Browse files Browse the repository at this point in the history
Add subclasses for fusion processing
  • Loading branch information
susannasiebert authored Dec 7, 2016
2 parents 935788e + 5da68a4 commit 1b4961a
Show file tree
Hide file tree
Showing 65 changed files with 25,206 additions and 24,628 deletions.
66 changes: 60 additions & 6 deletions pvacseq/lib/fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ def determine_peptide_sequence_length(self, full_wildtype_sequence_length, pepti
def determine_flanking_sequence_length(self, full_wildtype_sequence_length, peptide_sequence_length, line):
actual_peptide_sequence_length = self.determine_peptide_sequence_length(full_wildtype_sequence_length, peptide_sequence_length, line)
if actual_peptide_sequence_length%2 == 0:
return (actual_peptide_sequence_length-2) / 2
return int((actual_peptide_sequence_length-2) / 2)
else:
return (actual_peptide_sequence_length-1) / 2
return int((actual_peptide_sequence_length-1) / 2)

def get_wildtype_subsequence(self, position, full_wildtype_sequence, wildtype_amino_acid_length, peptide_sequence_length, line):
one_flanking_sequence_length = int(self.determine_flanking_sequence_length(len(full_wildtype_sequence), peptide_sequence_length, line))
one_flanking_sequence_length = self.determine_flanking_sequence_length(len(full_wildtype_sequence), peptide_sequence_length, line)
peptide_sequence_length = 2 * one_flanking_sequence_length + wildtype_amino_acid_length

# We want to extract a subset from full_wildtype_sequence that is
Expand Down Expand Up @@ -78,9 +78,9 @@ def get_frameshift_subsequences(self, position, full_wildtype_sequence, peptide_
if position < one_flanking_sequence_length:
start_position = 0
else:
start_position = int(position - one_flanking_sequence_length)
wildtype_subsequence_stop_position = int(position + one_flanking_sequence_length)
mutation_subsequence_stop_position = int(position)
start_position = position - one_flanking_sequence_length
wildtype_subsequence_stop_position = position + one_flanking_sequence_length
mutation_subsequence_stop_position = position
wildtype_subsequence = full_wildtype_sequence[start_position:wildtype_subsequence_stop_position]
mutation_start_subsequence = full_wildtype_sequence[start_position:mutation_subsequence_stop_position]
return wildtype_subsequence, mutation_start_subsequence
Expand Down Expand Up @@ -157,3 +157,57 @@ def execute(self):
reader.close()
writer.close()
key_writer.close()

class FusionFastaGenerator(FastaGenerator):
def execute(self):
peptide_sequence_length = self.peptide_sequence_length
reader = open(self.input_file, 'r')
tsvin = csv.DictReader(reader, delimiter='\t')
fasta_sequences = OrderedDict()
for line in tsvin:
variant_type = line['variant_type']
position = int(line['fusion_position'])
sequence = line['fusion_amino_acid_sequence']
one_flanking_sequence_length = self.determine_flanking_sequence_length(len(sequence), peptide_sequence_length, line)
if position < one_flanking_sequence_length:
start_position = 0
else:
start_position = position - one_flanking_sequence_length

if variant_type == 'inframe_fusion':
stop_position = position + one_flanking_sequence_length
subsequence = sequence[start_position:stop_position]
elif variant_type == 'frameshift_fusion':
subsequence = sequence[start_position:]
if subsequence.endswith('X'):
subsequence = subsequence[:-1]
else:
continue

if '*' in subsequence:
continue

if 'X' in subsequence:
continue

if len(subsequence) < self.epitope_length:
continue

if subsequence in fasta_sequences:
fasta_sequences[subsequence].append(line['index'])
else:
fasta_sequences[subsequence] = [line['index']]

writer = open(self.output_file, 'w')
key_writer = open(self.output_key_file, 'w')
count = 1
for (subsequence, keys) in fasta_sequences.items():
writer.writelines('>%s\n' % count)
writer.writelines('%s\n' % subsequence)
yaml.dump({count: keys}, key_writer, default_flow_style=False)
count += 1

reader.close()
writer.close()
key_writer.close()

2 changes: 1 addition & 1 deletion pvacseq/lib/generate_protein_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def convert_vcf(input_file, temp_dir):
'trna_snvs_coverage_file' : None,
'trna_indels_coverage_file' : None,
}
converter = InputFileConverter(**convert_params)
converter = VcfConverter(**convert_params)
converter.execute()
print("Completed")

Expand Down
151 changes: 123 additions & 28 deletions pvacseq/lib/input_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,44 @@
import sys
import re
from abc import ABCMeta
from collections import OrderedDict

class InputFileConverter(metaclass=ABCMeta):
def __init__(self, **kwargs):
self.input_file = kwargs['input_file']
self.output_file = kwargs['output_file']
self.input_file = kwargs['input_file']
self.output_file = kwargs['output_file']

def output_headers(self):
return[
'chromosome_name',
'start',
'stop',
'reference',
'variant',
'gene_name',
'transcript_name',
'amino_acid_change',
'ensembl_gene_id',
'wildtype_amino_acid_sequence',
'downstream_amino_acid_sequence',
'fusion_amino_acid_sequence',
'variant_type',
'protein_position',
'fusion_position',
'transcript_expression',
'gene_expression',
'normal_depth',
'normal_vaf',
'tdna_depth',
'tdna_vaf',
'trna_depth',
'trna_vaf',
'index'
]

class VcfConverter(InputFileConverter):
def __init__(self, **kwargs):
InputFileConverter.__init__(self, **kwargs)
self.gene_expn_file = kwargs['gene_expn_file']
self.transcript_expn_file = kwargs['transcript_expn_file']
self.normal_snvs_coverage_file = kwargs['normal_snvs_coverage_file']
Expand Down Expand Up @@ -121,32 +154,6 @@ def calculate_coverage(self, ref, var):
def calculate_vaf(self, ref, var):
return (var / (self.calculate_coverage(ref, var)+0.00001)) * 100

def output_headers(self):
return[
'chromosome_name',
'start',
'stop',
'reference',
'variant',
'gene_name',
'transcript_name',
'amino_acid_change',
'ensembl_gene_id',
'wildtype_amino_acid_sequence',
'downstream_amino_acid_sequence',
'variant_type',
'protein_position',
'transcript_expression',
'gene_expression',
'normal_depth',
'normal_vaf',
'tdna_depth',
'tdna_vaf',
'trna_depth',
'trna_vaf',
'index'
]

def execute(self):
gene_expns = {}
if self.gene_expn_file is not None:
Expand Down Expand Up @@ -297,3 +304,91 @@ def execute(self):

writer.close()
reader.close()

class IntegrateConverter(InputFileConverter):
def input_fieldnames(self):
return [
'chr 5p',
'start 5p',
'end 5p',
'chr 3p',
'start 3p',
'end 3p',
'name of fusion',
'tier of fusion',
'strand 5p',
'strand 3p',
'quantitation',
'is canonical boundary',
'can be in-frame',
'peptides',
'fusion positions',
'number of nucleotides in the break',
'transcripts',
'is canonical intron dinucleotide',
]

def fusions_for_three_p_transcripts(self, five_p_transcript, three_p_transcripts):
fusions = []
if len(three_p_transcripts):
for three_p_transcript in three_p_transcripts.split('|'):
fusions.append("%s-%s"% (five_p_transcript, three_p_transcript))
return fusions

def execute(self):
reader = open(self.input_file, 'r')
csv_reader = csv.DictReader(reader, delimiter='\t', fieldnames=self.input_fieldnames())
writer = open(self.output_file, 'w')
tsv_writer = csv.DictWriter(writer, delimiter='\t', fieldnames=self.output_headers())
tsv_writer.writeheader()
transcript_count = {}
for entry in csv_reader:
output_row = {
'chromosome_name' : entry['chr 5p'],
'start' : entry['start 5p'],
'stop' : entry['end 5p'],
'reference' : 'fusion',
'variant' : 'fusion',
'gene_name' : entry['name of fusion'],
'amino_acid_change' : 'NA',
'ensembl_gene_id' : 'NA',
'protein_position' : 'NA',
'amino_acid_change' : 'NA',
'transcript_expression' : 'NA',
'gene_expression' : 'NA',
'normal_depth' : 'NA',
'normal_vaf' : 'NA',
'tdna_depth' : 'NA',
'tdna_vaf' : 'NA',
'trna_depth' : 'NA',
'trna_vaf' : 'NA',
}

count = 1
for (fusion_position, transcript_set, fusion_amino_acid_sequence) in zip(entry['fusion positions'].split(','), entry['transcripts'].split(','), entry['peptides'].split(',')):
(five_p_transcripts, three_p_inframe_transcripts, three_p_frameshift_transcripts) = transcript_set.split(';')
inframe_fusions = []
frameshift_fusions = []
for five_p_transcript in five_p_transcripts.split('|'):
inframe_fusions.extend(self.fusions_for_three_p_transcripts(five_p_transcript, three_p_inframe_transcripts))
frameshift_fusions.extend(self.fusions_for_three_p_transcripts(five_p_transcript, three_p_frameshift_transcripts))

if len(inframe_fusions):
output_row['variant_type'] = 'inframe_fusion'
output_row['fusion_position'] = fusion_position
output_row['fusion_amino_acid_sequence'] = fusion_amino_acid_sequence
output_row['transcript_name'] = ';'.join(inframe_fusions)
output_row['index'] = '%s_%s.%s.%s' % (entry['name of fusion'], count, 'inframe_fusion', fusion_position)
tsv_writer.writerow(output_row)
if len(frameshift_fusions):
output_row['variant_type'] = 'frameshift_fusion'
output_row['fusion_position'] = fusion_position
output_row['fusion_amino_acid_sequence'] = fusion_amino_acid_sequence
output_row['transcript_name'] = ';'.join(frameshift_fusions)
output_row['index'] = '%s_%s.%s.%s' % (entry['name of fusion'], count, 'frameshift_fusion', fusion_position)
tsv_writer.writerow(output_row)

count += 1

writer.close()
reader.close()
Loading

0 comments on commit 1b4961a

Please sign in to comment.