diff --git a/README.md b/README.md
index 70b453d..780d178 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
-#EMBL-HLA-SUBMISSION
+#Bhast
+Ben's HLA Allele Submission Tool
A tool for generating an EMBL-formatted submission of a standard novel HLA allele.
##Download the executable
diff --git a/src/AlleleGenerator.py b/src/AlleleGenerator.py
index 410719b..88a1604 100755
--- a/src/AlleleGenerator.py
+++ b/src/AlleleGenerator.py
@@ -17,6 +17,7 @@
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
+import sys
import tkMessageBox
import math
@@ -26,63 +27,110 @@
# The AlleleGenerator class contains logic to generate an EMBL HLA allele submission
# In ENA format.
class AlleleGenerator():
+
+ def __init__(self):
- inputFileName = ''
- outputFileName = ''
- sequenceAnnotation = HLAGene()
- inputCellNummer = 0#12345
- inputGene = ''#HLA-C'
- inputAllele = ''#C0316ext'
+ self.inputFileName = ''
+ self.outputFileName = ''
+ self.sequenceAnnotation = HLAGene()
+ self.inputCellNummer = 0
+ self.inputGene = ''
+ self.inputAllele = ''
# This is a short wrapper method to use biopython's translation method.
+ # Most of this code is just checking for things that went wrong
def translateSequence(self,inputSequence):
- coding_dna = Seq(inputSequence, generic_dna)
+ proteinSequence = ''
- peptideSequence = str(coding_dna.translate())
- print ('Translated Protein:' + peptideSequence)
-
- #Stop codon *should* be at the end of the protein.
- stopCodonLocation = peptideSequence.find('*')
-
- if (stopCodonLocation == -1):
- if(len(coding_dna) % 3 == 0):
- tkMessageBox.showinfo('No Stop Codon Found',
- 'The translated protein does not contain a stop codon.' )
+ try:
+ # Do nothing if the input sequence is blank.
+ if( len(inputSequence) > 0 ):
+
+ coding_dna = Seq(inputSequence, generic_dna)
+ proteinSequence = str(coding_dna.translate())
+ print ('Exon Sequence before translation:' + coding_dna)
+ print ('Translated Protein:' + proteinSequence)
+
+ # Perform Sanity Checks.
+ # Stop codon *should* be at the end of the protein.
+ # Here we seek out the first instance of a stop codon,
+ # and remove the peptides afterwards.
+ # because that's what happens in real life.
+ stopCodonLocation = proteinSequence.find('*')
+
+ # If no stop codon was found
+ if (stopCodonLocation == -1):
+ # If multiple of three (correct codon length)
+ if(len(coding_dna) % 3 == 0):
+ tkMessageBox.showinfo('No Stop Codon Found',
+ 'The translated protein does not contain a stop codon.' )
+
+ # Wrong Codon Length
+ else:
+ tkMessageBox.showinfo('No Stop Codon Found',
+ 'The translated protein does not contain a stop codon.\n' +
+ 'The coding nucleotide sequence length (' + str(len(coding_dna)) + ') is not a multiple of 3.')
+
+ # If Stop Codon is in the end of the protein (This is expected and correct)
+ elif (stopCodonLocation == len(proteinSequence) - 1):
+ # If multiple of three (correct codon length)
+ if(len(coding_dna) % 3 == 0):
+ # Everything is fine in this case. Trim off the stop codon
+ proteinSequence = proteinSequence[0:stopCodonLocation]
+ pass
+ # Wrong Codon Length
+ else:
+ tkMessageBox.showinfo('Extra Nucleotides After the Stop Codon',
+ 'The stop codon is at the correct position in the protein, but ' +
+ 'The coding nucleotide sequence length (' + str(len(coding_dna)) + ') is not a multiple of 3.\n\n' +
+ 'Please double check your sequence.')
+ proteinSequence = proteinSequence[0:stopCodonLocation]
+
+ # Else Stop Codon is premature (before the end of the protein)
+ else:
+ # If multiple of three (correct codon length)
+ if(len(coding_dna) % 3 == 0):
+ tkMessageBox.showinfo('Premature Stop Codon Detected',
+ 'Premature stop codon found:\nProtein Position (' +
+ str(stopCodonLocation + 1) + '/' +
+ str(len(proteinSequence)) + ')\n\n' +
+ 'Double check your protein sequence,\n' +
+ 'this might indicate a missense mutation.\n\n' +
+ 'Translated Protein:\n' + proteinSequence +
+ '\n\nProtein in EMBL Submission:\n' + proteinSequence[0:stopCodonLocation] +
+ '\n'
+ )
+ proteinSequence = proteinSequence[0:stopCodonLocation]
+
+
+ # Wrong Codon Length
+ else:
+ tkMessageBox.showinfo('Premature Stop Codon Detected',
+ 'Premature stop codon found:\nProtein Position (' +
+ str(stopCodonLocation + 1) + '/' +
+ str(len(proteinSequence)) + ')\n\n' +
+ 'Nucleotide count is not a multiple of 3,\n' +
+ 'Double check your protein sequence,\n' +
+ 'this might indicate a missense mutation.\n\n' +
+ 'Translated Protein:\n' + proteinSequence +
+ '\n\nProtein in EMBL Submission:\n' + proteinSequence[0:stopCodonLocation] +
+ '\n'
+ )
+ proteinSequence = proteinSequence[0:stopCodonLocation]
else:
- tkMessageBox.showinfo('No Stop Codon Found',
- 'The translated protein does not contain a stop codon.\n' +
- 'It looks like a frame shift,\n' +
- 'The coding nucleotide sequence has length: ' + str(len(coding_dna)) + '.')
- else:
- if (stopCodonLocation == len(peptideSequence) - 1):
- #Stop codon is the last character in the peptide sequence. That's just fine, but trim off the stop codon.
- peptideSequence = peptideSequence[0:stopCodonLocation]
+ print('Translating a nucleotide sequence of length 0. That was easy.')
pass
- else:
- tkMessageBox.showinfo('Premature Stop Codon Detected',
- 'Premature stop codon found:\nPeptide Position (' +
- str(stopCodonLocation + 1) + '/' +
- str(len(peptideSequence)) + ')\n\n' +
- 'Double check your peptide sequence,\n' +
- 'Some aminos from the 3\' / C-Terminus\nwere spliced out.\n\n' +
- 'Before : ' + peptideSequence +
- '\nAfter : ' + peptideSequence[0:stopCodonLocation] +
- '\n'
- )
- peptideSequence = peptideSequence[0:stopCodonLocation]
-
- return peptideSequence
- #
- def readInputSequence(self):
-
- print('Reading file: ' + self.inputFileName)
-
- fileObject = open(self.inputFileName, 'r')
- fullFile = fileObject.read()
-
- self.processInputSequence(fullFile)
+ return proteinSequence
+
+ except Exception:
+ print 'Problem when translating protein:'
+ print sys.exc_info()[1]
+ tkMessageBox.showinfo('Protein Translation Error',
+ 'I could not translate your protein:\n' + str(sys.exc_info()[1]))
+
+ raise
# The input file should be a string of nucleotides, with capital letters to identify exons and introns.
# Annotations are expected and read in this format:
@@ -92,270 +140,338 @@ def readInputSequence(self):
def processInputSequence(self, inputSequenceText):
resultGeneLoci = HLAGene()
- # Why do I need to initialize loci array? I would have thought calling Gene() would do it.
- resultGeneLoci.loci = []
-
+
# Trim out any spaces, tabs, newlines. Uppercase.
cleanedGene = inputSequenceText.replace(' ','').replace('\n','').replace('\t','').replace('\r','')
- # Trim out the annotation marks, and capitalize, so I have a copy of the full sequence.
+ # Capitalize, so I can store a copy of the full unannotated sequence.
unannotatedGene = cleanedGene.upper()
resultGeneLoci.fullSequence = unannotatedGene
print('Total Sequence Length = ' + str(len(unannotatedGene)))
# Loop through the cleaned and annotated input sequence,
- # to search for exon annotation characters ( '[' and ']' )
- # I assume that the first and last loci are the 5' and 3' UTR,
- # I assume that Exons and Introns will alternate beyond that.
- # It no longer uses ( '[' and ']' ) to specify exons. I check for
# capitals and lowercase letters to determine exon start and end
- insideAnExon = False
- locusBeginPosition = 0
- for x in range(0, len(cleanedGene)):
- currentChar = cleanedGene[x]
+ if(len(cleanedGene) > 0):
- # Is this a standard nucleotide character?
- if(currentChar.upper() in ('A','G','C','T')):
-
- if(currentChar.isupper()):
- if(insideAnExon):
- #We're STILL in an exon. In this case, I should just do nothing and continue.
- pass
+ # Is the first feature an exon or an intron?
+ # If we begin in an Exon
+ if( cleanedGene[0] in ('A','G','C','T')):
+ insideAnExon = True
+ # If we begin in an Intron/UTR
+ elif( cleanedGene[0] in ('a','g','c','t')):
+ insideAnExon = False
+ else:
+ # Nonstandard nucleotide? I should start panicking.
+ #raise Exception('Nonstandard Nucleotide, not sure how to handle it')
+ print('Nonstandard Nucleotide at the beginning of the sequence, not sure how to handle it')
+ insideAnExon = False
+
+
+ locusBeginPosition = 0
+ for x in range(0, len(cleanedGene)):
+ currentChar = cleanedGene[x]
+
+ # Is this a standard nucleotide character?
+ if(currentChar.upper() in ('A','G','C','T')):
+
+ if(currentChar.isupper()):
+ if(insideAnExon):
+ #We're STILL in an exon. In this case, I should just do nothing and continue.
+ pass
+ else:
+ #In this case, we're just starting an EXON.
+ #Store the last Intron in the list.
+ currentIntron = GeneLocus()
+ currentIntron.sequence = cleanedGene[locusBeginPosition:x].upper()
+ currentIntron.exon = False
+ resultGeneLoci.loci.append(currentIntron)
+ insideAnExon=True
+ locusBeginPosition = x
+ pass
+
else:
- #In this case, we're just starting an EXON.
- #Store the last Intron in the list.
- currentIntron = GeneLocus()
- currentIntron.sequence = cleanedGene[locusBeginPosition:x].upper()
- currentIntron.exon = False
- resultGeneLoci.loci.append(currentIntron)
- insideAnExon=True
- locusBeginPosition = x
- pass
-
+ if not (insideAnExon):
+ #We're STILL in an intron. Continue.
+ pass
+ else:
+ #Starting a new Intron.
+ # Store an Exon in the list.
+ currentExon = GeneLocus()
+ currentExon.sequence = cleanedGene[locusBeginPosition:x].upper()
+ currentExon.exon = True
+ resultGeneLoci.loci.append(currentExon)
+ insideAnExon = False
+ locusBeginPosition=x
+ pass
else:
- if not (insideAnExon):
- #We're STILL in an intron. Continue.
- pass
- else:
- # Store an Exon in the list.
- currentExon = GeneLocus()
- currentExon.sequence = cleanedGene[locusBeginPosition:x].upper()
- currentExon.exon = True
- resultGeneLoci.loci.append(currentExon)
- insideAnExon = False
- locusBeginPosition=x
-
- #Starting a new Intron.
- pass
- else:
- print('Nonstandard nucleotide detected at position ' + str(x) + ' : ' + currentChar
- + '. If this is a wildcard character, you might be ok.')
-
- # Store the last(3') UTR as an intron.
- currentIntron = GeneLocus()
- currentIntron.sequence = cleanedGene[locusBeginPosition:len(cleanedGene)].upper()
- currentIntron.exon = False
- resultGeneLoci.loci.append(currentIntron)
-
- # Annotate the loci (name them) and print the results of the read file.
- resultGeneLoci.annotateLoci()
- resultGeneLoci.printGeneSummary()
-
- self.sequenceAnnotation = resultGeneLoci
-
- # Create the text submission based on the ENA format.
- def buildENASubmission(self):
+ print('Nonstandard nucleotide detected at position ' + str(x) + ' : ' + currentChar
+ + '. If this is a wildcard character, you might be ok.')
+
+ # We've reached the end of the loop and we still need to store the last feature.
+ # Should be a 3' UTR, but I can't be sure, people like to put in weird sequences.
+ currentIntron = GeneLocus()
+ currentIntron.sequence = cleanedGene[locusBeginPosition:len(cleanedGene)].upper()
+ currentIntron.exon = insideAnExon
+ resultGeneLoci.loci.append(currentIntron)
+
+ # Annotate the loci (name them) and print the results of the read file.
+ resultGeneLoci.annotateLoci()
+ resultGeneLoci.printGeneSummary()
- # ENA format is the preferred submission type for EMBL. More information:
- # http://www.ebi.ac.uk/ena/submit/sequence-submission
- # http://www.ebi.ac.uk/ena/submit/entry-upload-templates
- # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt
- # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
- # http://www.ebi.ac.uk/ena/software/flat-file-validator
-
- documentBuffer = ''
-
- # These variables are for test data, they should be filled in by GUI.
- #self.inputCellNummer = 23445
- #self.inputGene = 'HLA-C'
- #self.inputAllele = 'C0316ext'
+ # If the sequence is empty
+ else:
+ print('Empty sequence, I don\'t have anything to do.')
+
+ self.sequenceAnnotation = resultGeneLoci
- completeSequence = self.sequenceAnnotation.getCompleteSequence()
- exonSequence = self.sequenceAnnotation.getExonSequence()
- totalLength = self.sequenceAnnotation.totalLength()
- featureCount = len(self.sequenceAnnotation.loci)
- print('total calculated length = ' + str(totalLength))
+
+ def printHeader(self):
+ headerText = ''
+
# Print header
- documentBuffer += ('ID XXX; XXX; linear; genomic DNA; XXX; XXX; ' + str(totalLength) + ' BP.\n')
- documentBuffer += ('XX\n')
-
+ headerText += 'ID XXX; XXX; linear; genomic DNA; XXX; XXX; ' + str(self.sequenceAnnotation.totalLength()) + ' BP.\n'
+ headerText += 'XX\n'
# A valid document should have an AC (Accession Number) and DE (Description) field.
# I don't have an AC number available, so it's blank.
- documentBuffer += ('AC \n')
- documentBuffer += ('XX\n')
-
- documentBuffer += ('DE Human Leukocyte Antigen\n')
- documentBuffer += ('XX\n')
+ headerText += 'AC \n'
+ headerText += 'XX\n'
+ headerText += 'DE Human Leukocyte Antigen\n'
+ headerText += 'XX\n'
# Print key
- documentBuffer += ('FH Key Location/Qualifiers\n')
- documentBuffer += ('FH\n')
-
+ headerText += ('FH Key Location/Qualifiers\n')
+ headerText += ('FH\n')
+
# Print source
# It's from a human.
- documentBuffer += ('FT source 1..' + str(totalLength) + '\n')
- documentBuffer += ('FT /organism="Homo sapiens"\n')
- documentBuffer += ('FT /db_xref="taxon:9606"\n')
- documentBuffer += ('FT /mol_type="genomic DNA"\n')
- documentBuffer += ('FT /chromosome="6"\n')
- documentBuffer += ('FT /isolate="' + str(self.inputCellNummer) + '"\n')
-
+ headerText += ('FT source 1..' + str(self.sequenceAnnotation.totalLength()) + '\n')
+ headerText += ('FT /organism="Homo sapiens"\n')
+ headerText += ('FT /db_xref="taxon:9606"\n')
+ headerText += ('FT /mol_type="genomic DNA"\n')
+ headerText += ('FT /chromosome="6"\n')
+ headerText += ('FT /isolate="' + str(self.inputCellNummer) + '"\n')
+
+ return headerText
+
+ def printMRNA(self):
+ mRNAText = ''
# Print mRNA
- documentBuffer += ('FT mRNA join(')
+ mRNAText += ('FT mRNA join(')
+
# Iterate through the indices of the UTRs and exons.
# The 3' and 5' UTR are included in the mRNA
- # But not in the CDS (coding sequence), since they're untranslated.
- documentBuffer += (str(self.sequenceAnnotation.loci[0].beginIndex)
- + '..' + str(self.sequenceAnnotation.loci[0].endIndex) + ',')
-
- for x in range(1,featureCount-1):
+ for x in range(0,len(self.sequenceAnnotation.loci)):
geneLocus = self.sequenceAnnotation.loci[x]
- if (geneLocus.exon):
- documentBuffer += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex) + ','
+ # If it is an exon or UTR
+ if (geneLocus.exon or 'UT' in geneLocus.name):
+ mRNAText += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex) + ','
- documentBuffer += (str(self.sequenceAnnotation.loci[featureCount-1].beginIndex)
- + '..' + str(self.sequenceAnnotation.loci[featureCount-1].endIndex) + ')\n')
+ # Trim off the last comma and add a parenthese
+ mRNAText = mRNAText[0:len(mRNAText)-1] + ')\n'
- documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n')
- documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n')
- documentBuffer += ('FT /product=\"MHC class I antigen\"\n')
-
+ mRNAText += ('FT /gene="' + str(self.inputGene) + '"\n')
+ mRNAText += ('FT /allele="' + str(self.inputAllele) + '"\n')
+ mRNAText += ('FT /product=\"MHC class I antigen\"\n')
+
+ return mRNAText
+
+
+ def printCDS(self):
+ cdsText = ''
+
# Print CDS
# CDS is the coding sequence. It should include the exons, but not the UTRs/Introns
# The range 1:featureCount-1 will exclude the UTRs.
- documentBuffer += ('FT CDS join(')
- for x in range(1,featureCount-1):
+ cdsText += ('FT CDS join(')
+ for x in range(0,len(self.sequenceAnnotation.loci)):
geneLocus = self.sequenceAnnotation.loci[x]
if (geneLocus.exon):
- documentBuffer += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex)
- if not x==featureCount-2:
- documentBuffer += ','
+ cdsText += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex)
+ if not x==len(self.sequenceAnnotation.loci)-2:
+ cdsText += ','
else:
- documentBuffer += ')\n'
+ cdsText += ')\n'
- documentBuffer += ('FT /transl_table=1\n')
- documentBuffer += ('FT /codon_start=1\n')
- documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n')
- documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n')
- documentBuffer += ('FT /product=\"MHC class I antigen\"\n')
- documentBuffer += ('FT /translation=\"')
+ cdsText += ('FT /transl_table=1\n')
+ cdsText += ('FT /codon_start=1\n')
+ cdsText += ('FT /gene="' + str(self.inputGene) + '"\n')
+ cdsText += ('FT /allele="' + str(self.inputAllele) + '"\n')
+ cdsText += ('FT /product=\"MHC class I antigen\"\n')
+ cdsText += ('FT /translation=\"')
# Some simple formatting for the peptide sequence, making it human and computer readable.
# 80 peptides per line. Except the first line, which is 66.
# 66 is 80-14, where 14 is the length of { /translation=" }
- peptideSequence = self.translateSequence(exonSequence)
+ peptideSequence = self.translateSequence(self.sequenceAnnotation.getExonSequence())
if(len(peptideSequence) < 66):
- documentBuffer += (peptideSequence) + '\"\n'
+ cdsText += (peptideSequence) + '\"\n'
else:
- documentBuffer += peptideSequence[0:66] + '\n'
+ cdsText += peptideSequence[0:66] + '\n'
i=66
while (i < len(peptideSequence)):
- documentBuffer += 'FT ' + peptideSequence[i:i+80] + '\n'
+ cdsText += 'FT ' + peptideSequence[i:i+80] + '\n'
i += 80
-
- # Print 5'UTR
- utr = self.sequenceAnnotation.loci[0]
- documentBuffer += ('FT 5\'UTR ' + str(utr.beginIndex) + '..' + str(utr.endIndex) + '\n')
- documentBuffer += ('FT /note=\"5\'UTR\"\n')
- documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n')
- documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n')
-
- # Print alternating Ex/Int/Ex
- for x in range(1,featureCount-1):
+
+ return cdsText
+
+ def printFeatures(self):
+ featureText = ''
+
+ exonIndex = 1
+ intronIndex = 1
+
+ geneHas3UTR = False
+ geneHas5UTR = False
+
+ for x in range(0,len(self.sequenceAnnotation.loci)):
currentFeature = self.sequenceAnnotation.loci[x]
- if(currentFeature.exon):
- documentBuffer += ('FT exon ' + str(currentFeature.beginIndex)
+ # 3' UTR
+ if(currentFeature.name == '3UT'):
+ featureText += ('FT 3\'UTR ' + str(currentFeature.beginIndex) + '..' + str(currentFeature.endIndex) + '\n')
+ featureText += ('FT /note=\"3\'UTR\"\n')
+ featureText += ('FT /gene="' + str(self.inputGene) + '"\n')
+ featureText += ('FT /allele="' + str(self.inputAllele) + '"\n')
+ geneHas3UTR = True
+
+ # 5' UTR
+ elif(currentFeature.name == '5UT'):
+ featureText += ('FT 5\'UTR ' + str(currentFeature.beginIndex) + '..' + str(currentFeature.endIndex) + '\n')
+ featureText += ('FT /note=\"5\'UTR\"\n')
+ featureText += ('FT /gene="' + str(self.inputGene) + '"\n')
+ featureText += ('FT /allele="' + str(self.inputAllele) + '"\n')
+ geneHas5UTR = True
+
+ # Exon
+ elif(currentFeature.exon):
+ featureText += ('FT exon ' + str(currentFeature.beginIndex)
+ '..' + str(currentFeature.endIndex) + '\n')
+ featureText += ('FT /number=' + str(exonIndex) + '\n')
+ featureText += ('FT /gene="' + str(self.inputGene) + '"\n')
+ featureText += ('FT /allele="' + str(self.inputAllele) + '"\n')
+ exonIndex += 1
+
+ # Intron
else:
- documentBuffer += ('FT intron ' + str(currentFeature.beginIndex)
- + '..' + str(currentFeature.endIndex) + '\n')
-
- geneNumber = int(math.ceil(x / 2.0))
- documentBuffer += ('FT /number=' + str(geneNumber) + '\n')
- documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n')
- documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n')
+ featureText += ('FT intron ' + str(currentFeature.beginIndex)
+ + '..' + str(currentFeature.endIndex) + '\n')
+ featureText += ('FT /number=' + str(intronIndex) + '\n')
+ featureText += ('FT /gene="' + str(self.inputGene) + '"\n')
+ featureText += ('FT /allele="' + str(self.inputAllele) + '"\n')
+ intronIndex += 1
+
+
+ featureText += ('XX\n')
+
+ # Do a quick sanity check. If we are missing either UTR I should warn the user.
+ # But move on with your life, this is not worth getting upset over.
+ if (not geneHas3UTR and not geneHas5UTR):
+ tkMessageBox.showinfo('Missing UTRs',
+ 'This sequence has no 5\' or 3\' UTR.\n\n' +
+ 'Use lowercase nucleotides at the\n' +
+ 'beginning and end of your DNA\n' +
+ 'sequence to specify the 5\' and 3\' UTRs.' )
+ elif (not geneHas5UTR):
+ tkMessageBox.showinfo('Missing 5\' UTR',
+ 'This sequence has no 5\' UTR.\n\n' +
+ 'Use lowercase nucleotides at the\n' +
+ 'beginning and end of your DNA\n' +
+ 'sequence to specify the 5\' and 3\' UTRs.' )
+ elif (not geneHas3UTR):
+ tkMessageBox.showinfo('Missing 3\' UTR',
+ 'This sequence has no 3\' UTR.\n\n' +
+ 'Use lowercase nucleotides at the\n' +
+ 'beginning and end of your DNA\n' +
+ 'sequence to specify the 5\' and 3\' UTRs.' )
+ else:
+ print('The UTRs look fine.')
+ pass
+
+ return featureText
+
+ def printSequence(self):
+ sequenceText = ''
+
+ completeSequence = self.sequenceAnnotation.getCompleteSequence().upper()
+ cCount = completeSequence.count('C')
+ gCount = completeSequence.count('G')
+ tCount = completeSequence.count('T')
+ aCount = completeSequence.count('A')
+ otherCount = self.sequenceAnnotation.totalLength() - (cCount + gCount + tCount + aCount)
- # Print 3'UTR
- utr = self.sequenceAnnotation.loci[len(self.sequenceAnnotation.loci)-1]
- documentBuffer += ('FT 3\'UTR ' + str(utr.beginIndex) + '..' + str(utr.endIndex) + '\n')
- documentBuffer += ('FT /note=\"3\'UTR\"\n')
- documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n')
- documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n')
- documentBuffer += ('XX\n')
-
- # Print sequence
- # There's a sweet biopython method which can count the nucleotides.
- # Bio.Seq.count('A')
- # I didn't use it.
- cCount = 0
- gCount = 0
- tCount = 0
- aCount = 0
- otherCount = 0
- for nucleotide in completeSequence:
- if nucleotide == 'C':
- cCount+=1
- elif nucleotide == 'G':
- gCount+=1
- elif nucleotide == 'T':
- tCount+=1
- elif nucleotide == 'A':
- aCount+=1
- else:
- otherCount+=1
-
- documentBuffer += ('SQ Sequence ' + str(totalLength) + ' BP; '
+ sequenceText += ('SQ Sequence ' + str(self.sequenceAnnotation.totalLength()) + ' BP; '
+ str(aCount) + ' A; ' + str(cCount) + ' C; '
+ str(gCount) + ' G; ' + str(tCount) + ' T; '
+ str(otherCount) + ' other;\n')
# Here's some logic to print the sequence information in groups of 10.
# This format is specified in the User manual specified by EMBL.
- rowCount = 0
- columnCount = 0
currentSeqIndex = 0
- while (currentSeqIndex < totalLength):
+ while (currentSeqIndex < self.sequenceAnnotation.totalLength()):
# The character code for a sequence region is two blank spaces,
# followed by three blank spaces, for a total of 5 blanks.
- documentBuffer += ' '
- sequenceRow = completeSequence[currentSeqIndex : currentSeqIndex + 60]
+ sequenceText += ' '
+ sequenceRow = self.sequenceAnnotation.getCompleteSequence()[currentSeqIndex : currentSeqIndex + 60]
# A sequenceChunk is 10 nucleotides in this context.
# Format specifies up to six "chunks" per line.
for i in range(0,6):
sequenceChunk = sequenceRow[i*10 : (i+1)*10]
- documentBuffer += sequenceChunk + ' '
+ sequenceText += sequenceChunk + ' '
# If line is complete (=60 bp), we can print the nucleotide index and move on to the next row.
if(len(sequenceRow) == 60):
- documentBuffer += str(currentSeqIndex + 60) + '\n'
+ sequenceText += str(currentSeqIndex + 60) + '\n'
# but if line is not complete (this is more likely, and more complicated.)
else:
# Fill with spaces to align the nucleotide indices at the end of the sequence.
numberSpaces = 60-len(sequenceRow)
for n in range (0, numberSpaces):
- documentBuffer += ' '
- documentBuffer += (str(len(sequenceRow) + currentSeqIndex) + '\n')
+ sequenceText += ' '
+ sequenceText += (str(len(sequenceRow) + currentSeqIndex) + '\n')
# The next row of the sequence
currentSeqIndex += 60
-
- # Print entry terminator. The last line of an ENA entry.
- documentBuffer += ('//\n')
+
+ return sequenceText
+
+
+ # Create the text submission based on the ENA format.
+ def buildENASubmission(self):
+
+ # ENA format is the preferred submission type for EMBL. More information:
+ # http://www.ebi.ac.uk/ena/submit/sequence-submission
+ # http://www.ebi.ac.uk/ena/submit/entry-upload-templates
+ # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt
+ # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
+ # http://www.ebi.ac.uk/ena/software/flat-file-validator
+
+ documentBuffer = ''
+
+ totalLength = self.sequenceAnnotation.totalLength()
+ print('total calculated length = ' + str(totalLength))
+
+ if(totalLength > 0):
+
+ # These are the main sections of the ENA submission.
+ documentBuffer += self.printHeader()
+ documentBuffer += self.printMRNA()
+ documentBuffer += self.printCDS()
+ documentBuffer += self.printFeatures()
+ documentBuffer += self.printSequence()
+
+ # Print entry terminator. The last line of an ENA entry.
+ documentBuffer += ('//\n')
+
+ else:
+ tkMessageBox.showinfo('No HLA Sequence Found',
+ 'The HLA sequence is empty.\nPlease fill in an annotated HLA sequence\nbefore generating the submission.' )
+
+ pass
+
return documentBuffer
diff --git a/src/AlleleGui.py b/src/AlleleGui.py
index 7c94324..1a99985 100755
--- a/src/AlleleGui.py
+++ b/src/AlleleGui.py
@@ -14,7 +14,7 @@
# along with EMBL-HLA-Submission. If not, see .
# Version 1.0
-SoftwareVersion = "EMBL-HLA-Submission Version 1.0"
+SoftwareVersion = "Bhast Version 1.0"
import os
@@ -30,42 +30,42 @@ class AlleleGui(Tkinter.Frame):
# Initialize the GUI
def __init__(self, root):
Tkinter.Frame.__init__(self, root)
- root.title("EMBL Novel HLA Allele Submission Tool")
+ root.title("Bhast - A Novel HLA Allele Submission Generator")
self.parent = root
+
+ # Ctrl-A doesn't work by default in TK. I guess I need to do it myself.
+ root.bind_class("Text","", self.selectall)
+
self.initialize()
-
+
+ # I shouldn't need to write a select-All method but TK is kind of annoying.
+ def selectall(self, event):
+
+ event.widget.tag_add("sel","1.0","end")
+
# Initialize GUI elements
def initialize(self):
button_opt = {'fill': Tkconstants.BOTH, 'padx': 35, 'pady': 5}
-
self.cellNumInstrText = Tkinter.StringVar()
self.cellNumInstrText.set('Sample ID:')
self.inputCellNummer = Tkinter.StringVar()
- self.inputCellNummer.set('11111')
self.geneInstrText = Tkinter.StringVar()
self.geneInstrText.set('Gene:')
self.inputGene = Tkinter.StringVar()
- self.inputGene.set('HLA-C')
self.alleleInstrText = Tkinter.StringVar()
self.alleleInstrText.set('Allele:')
self.inputAllele = Tkinter.StringVar()
- self.inputAllele.set('C0213ext')
- #self.inputFeature = Tkinter.StringVar()
- #self.inputFeature.set('AGC[AGT]CCG[GGC]AAT')
self.featureInstrText = Tkinter.StringVar()
self.featureInstrText.set('Annotated Sequence:')
self.outputEMBLSubmission = Tkinter.StringVar()
self.outputEMBLSubmission.set('Resulting Allele Submission:')
- #Moving this to the bottom
- #Tkinter.Label(self, width=85, height=3, textvariable=self.instructionText).pack()
-
Tkinter.Label(self, width=80, height=1, textvariable=self.cellNumInstrText).pack()
Tkinter.Entry(self, width=15, textvariable=self.inputCellNummer).pack()
@@ -98,11 +98,7 @@ def initialize(self):
self.featureInputGuiObject.pack()
self.featureInputFrame.pack()
- self.featureInputGuiObject.delete('1.0','end')
- self.featureInputGuiObject.insert('1.0', 'aag\nCGTCGT\nccg\nGGCTGA\naat')
-
- #Tkinter.Button(self, text='\|/ Generate an EMBL submission \|/', command=self.updateGUI).pack(**button_opt)
- Tkinter.Button(self, text=unichr(8681) + ' Generate an EMBL submission ' + unichr(8681), command=self.updateGUI).pack(**button_opt)
+ Tkinter.Button(self, text=unichr(8681) + ' Generate an EMBL submission ' + unichr(8681), command=self.constructSubmission).pack(**button_opt)
Tkinter.Label(self, width=80, height=1, textvariable=self.outputEMBLSubmission).pack()
@@ -136,56 +132,61 @@ def initialize(self):
Tkinter.Button(self, text='Save this submission to my computer', command=self.saveSubmissionFile).pack(**button_opt)
self.instructionText = Tkinter.StringVar()
- #self.instructionText.set('This tool assumes you are submitting a standard HLA allele.\n'
- # + 'HLA alleles are assumed to be fully sequenced, including 5\' and 3\' UTRs.\n'
- # + 'Use capital letters for exons, lowercase for introns & UTRs, like this:\n'
- # + 'five\'utr EXON1 intron1 EXON2 ... EXON{X} three\'utr\n'
- # + 'All spaces, tabs, and newlines are discarded and ignored.')
self.instructionText.set('This tool was developed by the Tissue Typing Laboratory at\nMaastricht University Medical Center.\nFor more information:')
Tkinter.Label(self, width=85, height=3, textvariable=self.instructionText).pack()
-
-
+
# Make a frame for the more-info buttons
self.moreInfoFrame = Tkinter.Frame(self)
Tkinter.Button(self.moreInfoFrame, text='How to use this tool', command=self.howToUse).grid(row=0, column=0)
Tkinter.Button(self.moreInfoFrame, text='Contacting or Citing MUMC', command=self.contactInformation).grid(row=0, column=1)
+ Tkinter.Button(self.moreInfoFrame, text='Example Sequence', command=self.sampleSequence).grid(row=0, column=2)
-
-
self.moreInfoFrame.pack()
-
- self.updateGUI()
- def howToUse(self):
- # This method should popup some instruction text in a wee window.
+ def sampleSequence(self):
+ self.featureInputGuiObject.delete('1.0','end')
+ self.featureInputGuiObject.insert('1.0', 'aag\nCGTCGT\nccg\nGGCTGA\naat')
+
+ self.inputAllele.set('Allele:01:02')
+ self.inputGene.set('HLA-C')
+ self.inputCellNummer.set('Donor_12345')
- #self.instructionText.set('This tool assumes you are submitting a standard HLA allele.\n'
- # + 'HLA alleles are assumed to be fully sequenced, including 5\' and 3\' UTRs.\n'
- # + 'Use capital letters for exons, lowercase for introns & UTRs, like this:\n'
- # + 'five\'utr EXON1 intron1 EXON2 ... EXON{X} three\'utr\n'
- # + 'All spaces, tabs, and newlines are discarded and ignored.')
+ self.constructSubmission()
+ # This method should popup some instruction text in a wee window.
+ # This should be explicit on how to use the tool.
+ def howToUse(self):
tkMessageBox.showinfo('How to use this tool',
'This software is to be used to create an\n'
+ 'EMBL-formatted submission document,\n'
- + 'which specifies a novel HLA allele,\n'
- + 'including exon/intron annotation.\n\n'
+ + 'which specifies a (novel) HLA allele.\n\n'
- + 'This tool assumes you are submitting a\n'
- + 'full length HLA allele.\n'
- + 'HLA alleles should be fully sequenced,\n'
- + 'including 5\' and 3\' UTRs.\n'
+ + 'This tool requires you to submit a\n'
+ + 'full length HLA allele, including\n'
+ + '5\' and 3\' UTRs.\n\n'
+
+ 'Use capital letters for exons,\n'
+ 'lowercase for introns & UTRs.\n\n'
- + 'An example is included in the form,\n'
+ + 'Push the "Example Sequence" button to see a small example of'
+ + ' a formatted sequence.\n'
+ 'Sequences should follow this pattern:\n'
+ '5\'utr EX1 int1 EX2 ... EX{X} 3\'utr\n\n'
- + 'All spaces, tabs, and newlines are\n'
- + 'removed and ignored.'
+ + 'To use this tool:\n'
+ + '1.) Fill in a Sample ID, Gene Name, and Allele.'
+ + ' This text will be included in the submission.\n'
+ + '2.) Paste your formatted sequence in the\n'
+ + 'Annotated Sequence text area.\n'
+ + '3.) Push \"Generate an EMBL submission\" button'
+ + ' to generate a submission.\n'
+ + '4.) Push the "Save the submission" button'
+ + ' to store the submission on your computer.\nYou can submit this file to EMBL.\n\n'
+
+ + 'All spaces, tabs, and newlines are'
+ + ' removed before the nucleotide sequence is translated.'
)
def contactInformation(self):
@@ -221,12 +222,13 @@ def saveSubmissionFile(self):
options['initialdir'] = self.idir
options['parent'] = self
options['title'] = 'Specify your output file.'
+ options['initialfile'] = 'NovelAlleleEMBLSubmission.txt'
outputFileObject = tkFileDialog.asksaveasfile(**self.dir_opt)
submissionText = self.submOutputGuiObject.get('1.0', 'end')
outputFileObject.write(submissionText)
# Gather sequence information from the input elements, and generate a text EMBL submission.
- def updateGUI(self):
+ def constructSubmission(self):
allGen = AlleleGenerator()
roughFeatureSequence = self.featureInputGuiObject.get('1.0', 'end')
diff --git a/src/AlleleSubmissionEMBL.py b/src/AlleleSubmissionEMBL.py
index 1ffa0b5..6070ddb 100755
--- a/src/AlleleSubmissionEMBL.py
+++ b/src/AlleleSubmissionEMBL.py
@@ -15,7 +15,7 @@
# Version 1.0
-SoftwareVersion = "EMBL-HLA-Submission Version 1.0"
+SoftwareVersion = "Bhast Version 1.0"
import Tkinter
import sys
@@ -24,9 +24,9 @@
if __name__=='__main__':
try:
-
# This is a really simple way to read commandline args,
# because there really shouldn't be any.
+ # TODO: Be more graceful with this, there are better ways to read args.
# No parameters are expected at all. sys.argv[0] doesn't count.
if (len(sys.argv) == 1):
@@ -36,7 +36,7 @@
AlleleGui(root).pack()
root.mainloop()
- print('Done. Yay.')
+ print('Done. Hooray.')
# Print the Software Version
elif (len(sys.argv) == 2 and (
diff --git a/src/HLAGene.py b/src/HLAGene.py
index 9b2b7ca..69565e3 100755
--- a/src/HLAGene.py
+++ b/src/HLAGene.py
@@ -17,13 +17,14 @@
# The GeneLocus class specifies a locus on a Gene,
# Either an Exon, intron, or UTR.
-class GeneLocus():
-
- name = ''
- sequence = ''
- exon = False
- beginIndex = 0
- endIndex = 0
+class GeneLocus():
+
+ def __init__(self):
+ self.name = ''
+ self.sequence = ''
+ self.exon = False
+ self.beginIndex = 0
+ self.endIndex = 0
def length(self):
return 1 + self.endIndex - self.beginIndex
@@ -31,16 +32,15 @@ def length(self):
# The Gene class represents an entire HLA Gene, consisting of a series of loci.
class HLAGene():
- fullSequence = ''
- loci = []
+ def __init__(self):
+ self.fullSequence = ''
+ self.loci = []
def totalLength(self):
-
return len(self.getCompleteSequence())
# Combine the UTRs, Exons, and Introns into a contiguous sequence.
def getCompleteSequence(self):
-
sequence=''
for i in range(0, len(self.loci)):
sequence += self.loci[i].sequence
@@ -50,7 +50,7 @@ def getCompleteSequence(self):
def getExonSequence(self):
sequence=''
- for i in range(1, len(self.loci)-1):
+ for i in range(0, len(self.loci)):
if(self.loci[i].exon):
sequence += self.loci[i].sequence
return sequence
@@ -58,38 +58,50 @@ def getExonSequence(self):
# This method names the UTRs, Exons, and Introns, and records their indices.
# A HLA gene is always expected to have the pattern
# # 5UT -> EX1 -> IN1 -> EX2 -> IN2 -> ... -> EXN -> 3UT
+ # But I can't assume this so I will check.
def annotateLoci(self):
print('Annotating Gene Now')
+ # An index for the nucleotide position
lociBeginIndex = 1
- if(len(self.loci) > 2):
- for x in range(0, len(self.loci)):
-
- # Determine the name of this loci.
- # 5UT -> EX1 -> IN1 -> EX2 -> IN2 -> ... -> EXN -> 3UT
- if(x==0):
- self.loci[x].name = '5UT'
- elif(x==len(self.loci)-1):
- self.loci[x].name = '3UT'
- elif(x%2 == 1):
- self.loci[x].name = 'EX' + str(x/2 + 1)
- else:
- self.loci[x].name = 'I' + str(x/2)
+
+ # Start with the names 'EX1' and 'IN1'
+ exonIndex = 1
+ intronIndex = 1
+
+ lociCount = len(self.loci)
+
+ if(lociCount == 0):
+ print('No loci to annotate. We are done here.')
+ # TODO: do I need to raise an exception or just ignore the problem?
+
+ else:
+ for lociIndex in range(0, len(self.loci)):
+
+ if (self.loci[lociIndex].exon):
+ self.loci[lociIndex].name = 'EX' + str(exonIndex)
+ exonIndex += 1
+ else :
+ # Is it a UTR or an Intron?
+ if (lociIndex == 0):
+ self.loci[lociIndex].name = '5UT'
+ elif (lociIndex == lociCount - 1):
+ self.loci[lociIndex].name = '3UT'
+ else:
+ self.loci[lociIndex].name = 'I' + str(intronIndex)
+ intronIndex += 1
# Determine start and end indices of these exons.
# Attempting to make index that looks like:
#5UT: 1-65
#EX1: 66-137
# I1: 138-267
- self.loci[x].beginIndex = lociBeginIndex
- lociBeginIndex += len(self.loci[x].sequence)
- self.loci[x].endIndex = lociBeginIndex - 1
-
-
- else:
- print('I expected at least three loci in order to annotate them. Please double check your input file.')
-
+ self.loci[lociIndex].beginIndex = lociBeginIndex
+ lociBeginIndex += len(self.loci[lociIndex].sequence)
+ self.loci[lociIndex].endIndex = lociBeginIndex - 1
+
+
# Print a summary of the inputted sequence to console.
def printGeneSummary(self):
print('\nPrinting Gene Summary')