diff --git a/README.md b/README.md index 70b453d..780d178 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -#EMBL-HLA-SUBMISSION +#Bhast +Ben's HLA Allele Submission Tool A tool for generating an EMBL-formatted submission of a standard novel HLA allele. ##Download the executable diff --git a/src/AlleleGenerator.py b/src/AlleleGenerator.py index 410719b..88a1604 100755 --- a/src/AlleleGenerator.py +++ b/src/AlleleGenerator.py @@ -17,6 +17,7 @@ from Bio.Seq import Seq from Bio.Alphabet import generic_dna +import sys import tkMessageBox import math @@ -26,63 +27,110 @@ # The AlleleGenerator class contains logic to generate an EMBL HLA allele submission # In ENA format. class AlleleGenerator(): + + def __init__(self): - inputFileName = '' - outputFileName = '' - sequenceAnnotation = HLAGene() - inputCellNummer = 0#12345 - inputGene = ''#HLA-C' - inputAllele = ''#C0316ext' + self.inputFileName = '' + self.outputFileName = '' + self.sequenceAnnotation = HLAGene() + self.inputCellNummer = 0 + self.inputGene = '' + self.inputAllele = '' # This is a short wrapper method to use biopython's translation method. + # Most of this code is just checking for things that went wrong def translateSequence(self,inputSequence): - coding_dna = Seq(inputSequence, generic_dna) + proteinSequence = '' - peptideSequence = str(coding_dna.translate()) - print ('Translated Protein:' + peptideSequence) - - #Stop codon *should* be at the end of the protein. - stopCodonLocation = peptideSequence.find('*') - - if (stopCodonLocation == -1): - if(len(coding_dna) % 3 == 0): - tkMessageBox.showinfo('No Stop Codon Found', - 'The translated protein does not contain a stop codon.' ) + try: + # Do nothing if the input sequence is blank. + if( len(inputSequence) > 0 ): + + coding_dna = Seq(inputSequence, generic_dna) + proteinSequence = str(coding_dna.translate()) + print ('Exon Sequence before translation:' + coding_dna) + print ('Translated Protein:' + proteinSequence) + + # Perform Sanity Checks. + # Stop codon *should* be at the end of the protein. + # Here we seek out the first instance of a stop codon, + # and remove the peptides afterwards. + # because that's what happens in real life. + stopCodonLocation = proteinSequence.find('*') + + # If no stop codon was found + if (stopCodonLocation == -1): + # If multiple of three (correct codon length) + if(len(coding_dna) % 3 == 0): + tkMessageBox.showinfo('No Stop Codon Found', + 'The translated protein does not contain a stop codon.' ) + + # Wrong Codon Length + else: + tkMessageBox.showinfo('No Stop Codon Found', + 'The translated protein does not contain a stop codon.\n' + + 'The coding nucleotide sequence length (' + str(len(coding_dna)) + ') is not a multiple of 3.') + + # If Stop Codon is in the end of the protein (This is expected and correct) + elif (stopCodonLocation == len(proteinSequence) - 1): + # If multiple of three (correct codon length) + if(len(coding_dna) % 3 == 0): + # Everything is fine in this case. Trim off the stop codon + proteinSequence = proteinSequence[0:stopCodonLocation] + pass + # Wrong Codon Length + else: + tkMessageBox.showinfo('Extra Nucleotides After the Stop Codon', + 'The stop codon is at the correct position in the protein, but ' + + 'The coding nucleotide sequence length (' + str(len(coding_dna)) + ') is not a multiple of 3.\n\n' + + 'Please double check your sequence.') + proteinSequence = proteinSequence[0:stopCodonLocation] + + # Else Stop Codon is premature (before the end of the protein) + else: + # If multiple of three (correct codon length) + if(len(coding_dna) % 3 == 0): + tkMessageBox.showinfo('Premature Stop Codon Detected', + 'Premature stop codon found:\nProtein Position (' + + str(stopCodonLocation + 1) + '/' + + str(len(proteinSequence)) + ')\n\n' + + 'Double check your protein sequence,\n' + + 'this might indicate a missense mutation.\n\n' + + 'Translated Protein:\n' + proteinSequence + + '\n\nProtein in EMBL Submission:\n' + proteinSequence[0:stopCodonLocation] + + '\n' + ) + proteinSequence = proteinSequence[0:stopCodonLocation] + + + # Wrong Codon Length + else: + tkMessageBox.showinfo('Premature Stop Codon Detected', + 'Premature stop codon found:\nProtein Position (' + + str(stopCodonLocation + 1) + '/' + + str(len(proteinSequence)) + ')\n\n' + + 'Nucleotide count is not a multiple of 3,\n' + + 'Double check your protein sequence,\n' + + 'this might indicate a missense mutation.\n\n' + + 'Translated Protein:\n' + proteinSequence + + '\n\nProtein in EMBL Submission:\n' + proteinSequence[0:stopCodonLocation] + + '\n' + ) + proteinSequence = proteinSequence[0:stopCodonLocation] else: - tkMessageBox.showinfo('No Stop Codon Found', - 'The translated protein does not contain a stop codon.\n' + - 'It looks like a frame shift,\n' + - 'The coding nucleotide sequence has length: ' + str(len(coding_dna)) + '.') - else: - if (stopCodonLocation == len(peptideSequence) - 1): - #Stop codon is the last character in the peptide sequence. That's just fine, but trim off the stop codon. - peptideSequence = peptideSequence[0:stopCodonLocation] + print('Translating a nucleotide sequence of length 0. That was easy.') pass - else: - tkMessageBox.showinfo('Premature Stop Codon Detected', - 'Premature stop codon found:\nPeptide Position (' + - str(stopCodonLocation + 1) + '/' + - str(len(peptideSequence)) + ')\n\n' + - 'Double check your peptide sequence,\n' + - 'Some aminos from the 3\' / C-Terminus\nwere spliced out.\n\n' + - 'Before : ' + peptideSequence + - '\nAfter : ' + peptideSequence[0:stopCodonLocation] + - '\n' - ) - peptideSequence = peptideSequence[0:stopCodonLocation] - - return peptideSequence - # - def readInputSequence(self): - - print('Reading file: ' + self.inputFileName) - - fileObject = open(self.inputFileName, 'r') - fullFile = fileObject.read() - - self.processInputSequence(fullFile) + return proteinSequence + + except Exception: + print 'Problem when translating protein:' + print sys.exc_info()[1] + tkMessageBox.showinfo('Protein Translation Error', + 'I could not translate your protein:\n' + str(sys.exc_info()[1])) + + raise # The input file should be a string of nucleotides, with capital letters to identify exons and introns. # Annotations are expected and read in this format: @@ -92,270 +140,338 @@ def readInputSequence(self): def processInputSequence(self, inputSequenceText): resultGeneLoci = HLAGene() - # Why do I need to initialize loci array? I would have thought calling Gene() would do it. - resultGeneLoci.loci = [] - + # Trim out any spaces, tabs, newlines. Uppercase. cleanedGene = inputSequenceText.replace(' ','').replace('\n','').replace('\t','').replace('\r','') - # Trim out the annotation marks, and capitalize, so I have a copy of the full sequence. + # Capitalize, so I can store a copy of the full unannotated sequence. unannotatedGene = cleanedGene.upper() resultGeneLoci.fullSequence = unannotatedGene print('Total Sequence Length = ' + str(len(unannotatedGene))) # Loop through the cleaned and annotated input sequence, - # to search for exon annotation characters ( '[' and ']' ) - # I assume that the first and last loci are the 5' and 3' UTR, - # I assume that Exons and Introns will alternate beyond that. - # It no longer uses ( '[' and ']' ) to specify exons. I check for # capitals and lowercase letters to determine exon start and end - insideAnExon = False - locusBeginPosition = 0 - for x in range(0, len(cleanedGene)): - currentChar = cleanedGene[x] + if(len(cleanedGene) > 0): - # Is this a standard nucleotide character? - if(currentChar.upper() in ('A','G','C','T')): - - if(currentChar.isupper()): - if(insideAnExon): - #We're STILL in an exon. In this case, I should just do nothing and continue. - pass + # Is the first feature an exon or an intron? + # If we begin in an Exon + if( cleanedGene[0] in ('A','G','C','T')): + insideAnExon = True + # If we begin in an Intron/UTR + elif( cleanedGene[0] in ('a','g','c','t')): + insideAnExon = False + else: + # Nonstandard nucleotide? I should start panicking. + #raise Exception('Nonstandard Nucleotide, not sure how to handle it') + print('Nonstandard Nucleotide at the beginning of the sequence, not sure how to handle it') + insideAnExon = False + + + locusBeginPosition = 0 + for x in range(0, len(cleanedGene)): + currentChar = cleanedGene[x] + + # Is this a standard nucleotide character? + if(currentChar.upper() in ('A','G','C','T')): + + if(currentChar.isupper()): + if(insideAnExon): + #We're STILL in an exon. In this case, I should just do nothing and continue. + pass + else: + #In this case, we're just starting an EXON. + #Store the last Intron in the list. + currentIntron = GeneLocus() + currentIntron.sequence = cleanedGene[locusBeginPosition:x].upper() + currentIntron.exon = False + resultGeneLoci.loci.append(currentIntron) + insideAnExon=True + locusBeginPosition = x + pass + else: - #In this case, we're just starting an EXON. - #Store the last Intron in the list. - currentIntron = GeneLocus() - currentIntron.sequence = cleanedGene[locusBeginPosition:x].upper() - currentIntron.exon = False - resultGeneLoci.loci.append(currentIntron) - insideAnExon=True - locusBeginPosition = x - pass - + if not (insideAnExon): + #We're STILL in an intron. Continue. + pass + else: + #Starting a new Intron. + # Store an Exon in the list. + currentExon = GeneLocus() + currentExon.sequence = cleanedGene[locusBeginPosition:x].upper() + currentExon.exon = True + resultGeneLoci.loci.append(currentExon) + insideAnExon = False + locusBeginPosition=x + pass else: - if not (insideAnExon): - #We're STILL in an intron. Continue. - pass - else: - # Store an Exon in the list. - currentExon = GeneLocus() - currentExon.sequence = cleanedGene[locusBeginPosition:x].upper() - currentExon.exon = True - resultGeneLoci.loci.append(currentExon) - insideAnExon = False - locusBeginPosition=x - - #Starting a new Intron. - pass - else: - print('Nonstandard nucleotide detected at position ' + str(x) + ' : ' + currentChar - + '. If this is a wildcard character, you might be ok.') - - # Store the last(3') UTR as an intron. - currentIntron = GeneLocus() - currentIntron.sequence = cleanedGene[locusBeginPosition:len(cleanedGene)].upper() - currentIntron.exon = False - resultGeneLoci.loci.append(currentIntron) - - # Annotate the loci (name them) and print the results of the read file. - resultGeneLoci.annotateLoci() - resultGeneLoci.printGeneSummary() - - self.sequenceAnnotation = resultGeneLoci - - # Create the text submission based on the ENA format. - def buildENASubmission(self): + print('Nonstandard nucleotide detected at position ' + str(x) + ' : ' + currentChar + + '. If this is a wildcard character, you might be ok.') + + # We've reached the end of the loop and we still need to store the last feature. + # Should be a 3' UTR, but I can't be sure, people like to put in weird sequences. + currentIntron = GeneLocus() + currentIntron.sequence = cleanedGene[locusBeginPosition:len(cleanedGene)].upper() + currentIntron.exon = insideAnExon + resultGeneLoci.loci.append(currentIntron) + + # Annotate the loci (name them) and print the results of the read file. + resultGeneLoci.annotateLoci() + resultGeneLoci.printGeneSummary() - # ENA format is the preferred submission type for EMBL. More information: - # http://www.ebi.ac.uk/ena/submit/sequence-submission - # http://www.ebi.ac.uk/ena/submit/entry-upload-templates - # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt - # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html - # http://www.ebi.ac.uk/ena/software/flat-file-validator - - documentBuffer = '' - - # These variables are for test data, they should be filled in by GUI. - #self.inputCellNummer = 23445 - #self.inputGene = 'HLA-C' - #self.inputAllele = 'C0316ext' + # If the sequence is empty + else: + print('Empty sequence, I don\'t have anything to do.') + + self.sequenceAnnotation = resultGeneLoci - completeSequence = self.sequenceAnnotation.getCompleteSequence() - exonSequence = self.sequenceAnnotation.getExonSequence() - totalLength = self.sequenceAnnotation.totalLength() - featureCount = len(self.sequenceAnnotation.loci) - print('total calculated length = ' + str(totalLength)) + + def printHeader(self): + headerText = '' + # Print header - documentBuffer += ('ID XXX; XXX; linear; genomic DNA; XXX; XXX; ' + str(totalLength) + ' BP.\n') - documentBuffer += ('XX\n') - + headerText += 'ID XXX; XXX; linear; genomic DNA; XXX; XXX; ' + str(self.sequenceAnnotation.totalLength()) + ' BP.\n' + headerText += 'XX\n' # A valid document should have an AC (Accession Number) and DE (Description) field. # I don't have an AC number available, so it's blank. - documentBuffer += ('AC \n') - documentBuffer += ('XX\n') - - documentBuffer += ('DE Human Leukocyte Antigen\n') - documentBuffer += ('XX\n') + headerText += 'AC \n' + headerText += 'XX\n' + headerText += 'DE Human Leukocyte Antigen\n' + headerText += 'XX\n' # Print key - documentBuffer += ('FH Key Location/Qualifiers\n') - documentBuffer += ('FH\n') - + headerText += ('FH Key Location/Qualifiers\n') + headerText += ('FH\n') + # Print source # It's from a human. - documentBuffer += ('FT source 1..' + str(totalLength) + '\n') - documentBuffer += ('FT /organism="Homo sapiens"\n') - documentBuffer += ('FT /db_xref="taxon:9606"\n') - documentBuffer += ('FT /mol_type="genomic DNA"\n') - documentBuffer += ('FT /chromosome="6"\n') - documentBuffer += ('FT /isolate="' + str(self.inputCellNummer) + '"\n') - + headerText += ('FT source 1..' + str(self.sequenceAnnotation.totalLength()) + '\n') + headerText += ('FT /organism="Homo sapiens"\n') + headerText += ('FT /db_xref="taxon:9606"\n') + headerText += ('FT /mol_type="genomic DNA"\n') + headerText += ('FT /chromosome="6"\n') + headerText += ('FT /isolate="' + str(self.inputCellNummer) + '"\n') + + return headerText + + def printMRNA(self): + mRNAText = '' # Print mRNA - documentBuffer += ('FT mRNA join(') + mRNAText += ('FT mRNA join(') + # Iterate through the indices of the UTRs and exons. # The 3' and 5' UTR are included in the mRNA - # But not in the CDS (coding sequence), since they're untranslated. - documentBuffer += (str(self.sequenceAnnotation.loci[0].beginIndex) - + '..' + str(self.sequenceAnnotation.loci[0].endIndex) + ',') - - for x in range(1,featureCount-1): + for x in range(0,len(self.sequenceAnnotation.loci)): geneLocus = self.sequenceAnnotation.loci[x] - if (geneLocus.exon): - documentBuffer += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex) + ',' + # If it is an exon or UTR + if (geneLocus.exon or 'UT' in geneLocus.name): + mRNAText += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex) + ',' - documentBuffer += (str(self.sequenceAnnotation.loci[featureCount-1].beginIndex) - + '..' + str(self.sequenceAnnotation.loci[featureCount-1].endIndex) + ')\n') + # Trim off the last comma and add a parenthese + mRNAText = mRNAText[0:len(mRNAText)-1] + ')\n' - documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n') - documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n') - documentBuffer += ('FT /product=\"MHC class I antigen\"\n') - + mRNAText += ('FT /gene="' + str(self.inputGene) + '"\n') + mRNAText += ('FT /allele="' + str(self.inputAllele) + '"\n') + mRNAText += ('FT /product=\"MHC class I antigen\"\n') + + return mRNAText + + + def printCDS(self): + cdsText = '' + # Print CDS # CDS is the coding sequence. It should include the exons, but not the UTRs/Introns # The range 1:featureCount-1 will exclude the UTRs. - documentBuffer += ('FT CDS join(') - for x in range(1,featureCount-1): + cdsText += ('FT CDS join(') + for x in range(0,len(self.sequenceAnnotation.loci)): geneLocus = self.sequenceAnnotation.loci[x] if (geneLocus.exon): - documentBuffer += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex) - if not x==featureCount-2: - documentBuffer += ',' + cdsText += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex) + if not x==len(self.sequenceAnnotation.loci)-2: + cdsText += ',' else: - documentBuffer += ')\n' + cdsText += ')\n' - documentBuffer += ('FT /transl_table=1\n') - documentBuffer += ('FT /codon_start=1\n') - documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n') - documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n') - documentBuffer += ('FT /product=\"MHC class I antigen\"\n') - documentBuffer += ('FT /translation=\"') + cdsText += ('FT /transl_table=1\n') + cdsText += ('FT /codon_start=1\n') + cdsText += ('FT /gene="' + str(self.inputGene) + '"\n') + cdsText += ('FT /allele="' + str(self.inputAllele) + '"\n') + cdsText += ('FT /product=\"MHC class I antigen\"\n') + cdsText += ('FT /translation=\"') # Some simple formatting for the peptide sequence, making it human and computer readable. # 80 peptides per line. Except the first line, which is 66. # 66 is 80-14, where 14 is the length of { /translation=" } - peptideSequence = self.translateSequence(exonSequence) + peptideSequence = self.translateSequence(self.sequenceAnnotation.getExonSequence()) if(len(peptideSequence) < 66): - documentBuffer += (peptideSequence) + '\"\n' + cdsText += (peptideSequence) + '\"\n' else: - documentBuffer += peptideSequence[0:66] + '\n' + cdsText += peptideSequence[0:66] + '\n' i=66 while (i < len(peptideSequence)): - documentBuffer += 'FT ' + peptideSequence[i:i+80] + '\n' + cdsText += 'FT ' + peptideSequence[i:i+80] + '\n' i += 80 - - # Print 5'UTR - utr = self.sequenceAnnotation.loci[0] - documentBuffer += ('FT 5\'UTR ' + str(utr.beginIndex) + '..' + str(utr.endIndex) + '\n') - documentBuffer += ('FT /note=\"5\'UTR\"\n') - documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n') - documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n') - - # Print alternating Ex/Int/Ex - for x in range(1,featureCount-1): + + return cdsText + + def printFeatures(self): + featureText = '' + + exonIndex = 1 + intronIndex = 1 + + geneHas3UTR = False + geneHas5UTR = False + + for x in range(0,len(self.sequenceAnnotation.loci)): currentFeature = self.sequenceAnnotation.loci[x] - if(currentFeature.exon): - documentBuffer += ('FT exon ' + str(currentFeature.beginIndex) + # 3' UTR + if(currentFeature.name == '3UT'): + featureText += ('FT 3\'UTR ' + str(currentFeature.beginIndex) + '..' + str(currentFeature.endIndex) + '\n') + featureText += ('FT /note=\"3\'UTR\"\n') + featureText += ('FT /gene="' + str(self.inputGene) + '"\n') + featureText += ('FT /allele="' + str(self.inputAllele) + '"\n') + geneHas3UTR = True + + # 5' UTR + elif(currentFeature.name == '5UT'): + featureText += ('FT 5\'UTR ' + str(currentFeature.beginIndex) + '..' + str(currentFeature.endIndex) + '\n') + featureText += ('FT /note=\"5\'UTR\"\n') + featureText += ('FT /gene="' + str(self.inputGene) + '"\n') + featureText += ('FT /allele="' + str(self.inputAllele) + '"\n') + geneHas5UTR = True + + # Exon + elif(currentFeature.exon): + featureText += ('FT exon ' + str(currentFeature.beginIndex) + '..' + str(currentFeature.endIndex) + '\n') + featureText += ('FT /number=' + str(exonIndex) + '\n') + featureText += ('FT /gene="' + str(self.inputGene) + '"\n') + featureText += ('FT /allele="' + str(self.inputAllele) + '"\n') + exonIndex += 1 + + # Intron else: - documentBuffer += ('FT intron ' + str(currentFeature.beginIndex) - + '..' + str(currentFeature.endIndex) + '\n') - - geneNumber = int(math.ceil(x / 2.0)) - documentBuffer += ('FT /number=' + str(geneNumber) + '\n') - documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n') - documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n') + featureText += ('FT intron ' + str(currentFeature.beginIndex) + + '..' + str(currentFeature.endIndex) + '\n') + featureText += ('FT /number=' + str(intronIndex) + '\n') + featureText += ('FT /gene="' + str(self.inputGene) + '"\n') + featureText += ('FT /allele="' + str(self.inputAllele) + '"\n') + intronIndex += 1 + + + featureText += ('XX\n') + + # Do a quick sanity check. If we are missing either UTR I should warn the user. + # But move on with your life, this is not worth getting upset over. + if (not geneHas3UTR and not geneHas5UTR): + tkMessageBox.showinfo('Missing UTRs', + 'This sequence has no 5\' or 3\' UTR.\n\n' + + 'Use lowercase nucleotides at the\n' + + 'beginning and end of your DNA\n' + + 'sequence to specify the 5\' and 3\' UTRs.' ) + elif (not geneHas5UTR): + tkMessageBox.showinfo('Missing 5\' UTR', + 'This sequence has no 5\' UTR.\n\n' + + 'Use lowercase nucleotides at the\n' + + 'beginning and end of your DNA\n' + + 'sequence to specify the 5\' and 3\' UTRs.' ) + elif (not geneHas3UTR): + tkMessageBox.showinfo('Missing 3\' UTR', + 'This sequence has no 3\' UTR.\n\n' + + 'Use lowercase nucleotides at the\n' + + 'beginning and end of your DNA\n' + + 'sequence to specify the 5\' and 3\' UTRs.' ) + else: + print('The UTRs look fine.') + pass + + return featureText + + def printSequence(self): + sequenceText = '' + + completeSequence = self.sequenceAnnotation.getCompleteSequence().upper() + cCount = completeSequence.count('C') + gCount = completeSequence.count('G') + tCount = completeSequence.count('T') + aCount = completeSequence.count('A') + otherCount = self.sequenceAnnotation.totalLength() - (cCount + gCount + tCount + aCount) - # Print 3'UTR - utr = self.sequenceAnnotation.loci[len(self.sequenceAnnotation.loci)-1] - documentBuffer += ('FT 3\'UTR ' + str(utr.beginIndex) + '..' + str(utr.endIndex) + '\n') - documentBuffer += ('FT /note=\"3\'UTR\"\n') - documentBuffer += ('FT /gene="' + str(self.inputGene) + '"\n') - documentBuffer += ('FT /allele="' + str(self.inputAllele) + '"\n') - documentBuffer += ('XX\n') - - # Print sequence - # There's a sweet biopython method which can count the nucleotides. - # Bio.Seq.count('A') - # I didn't use it. - cCount = 0 - gCount = 0 - tCount = 0 - aCount = 0 - otherCount = 0 - for nucleotide in completeSequence: - if nucleotide == 'C': - cCount+=1 - elif nucleotide == 'G': - gCount+=1 - elif nucleotide == 'T': - tCount+=1 - elif nucleotide == 'A': - aCount+=1 - else: - otherCount+=1 - - documentBuffer += ('SQ Sequence ' + str(totalLength) + ' BP; ' + sequenceText += ('SQ Sequence ' + str(self.sequenceAnnotation.totalLength()) + ' BP; ' + str(aCount) + ' A; ' + str(cCount) + ' C; ' + str(gCount) + ' G; ' + str(tCount) + ' T; ' + str(otherCount) + ' other;\n') # Here's some logic to print the sequence information in groups of 10. # This format is specified in the User manual specified by EMBL. - rowCount = 0 - columnCount = 0 currentSeqIndex = 0 - while (currentSeqIndex < totalLength): + while (currentSeqIndex < self.sequenceAnnotation.totalLength()): # The character code for a sequence region is two blank spaces, # followed by three blank spaces, for a total of 5 blanks. - documentBuffer += ' ' - sequenceRow = completeSequence[currentSeqIndex : currentSeqIndex + 60] + sequenceText += ' ' + sequenceRow = self.sequenceAnnotation.getCompleteSequence()[currentSeqIndex : currentSeqIndex + 60] # A sequenceChunk is 10 nucleotides in this context. # Format specifies up to six "chunks" per line. for i in range(0,6): sequenceChunk = sequenceRow[i*10 : (i+1)*10] - documentBuffer += sequenceChunk + ' ' + sequenceText += sequenceChunk + ' ' # If line is complete (=60 bp), we can print the nucleotide index and move on to the next row. if(len(sequenceRow) == 60): - documentBuffer += str(currentSeqIndex + 60) + '\n' + sequenceText += str(currentSeqIndex + 60) + '\n' # but if line is not complete (this is more likely, and more complicated.) else: # Fill with spaces to align the nucleotide indices at the end of the sequence. numberSpaces = 60-len(sequenceRow) for n in range (0, numberSpaces): - documentBuffer += ' ' - documentBuffer += (str(len(sequenceRow) + currentSeqIndex) + '\n') + sequenceText += ' ' + sequenceText += (str(len(sequenceRow) + currentSeqIndex) + '\n') # The next row of the sequence currentSeqIndex += 60 - - # Print entry terminator. The last line of an ENA entry. - documentBuffer += ('//\n') + + return sequenceText + + + # Create the text submission based on the ENA format. + def buildENASubmission(self): + + # ENA format is the preferred submission type for EMBL. More information: + # http://www.ebi.ac.uk/ena/submit/sequence-submission + # http://www.ebi.ac.uk/ena/submit/entry-upload-templates + # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt + # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html + # http://www.ebi.ac.uk/ena/software/flat-file-validator + + documentBuffer = '' + + totalLength = self.sequenceAnnotation.totalLength() + print('total calculated length = ' + str(totalLength)) + + if(totalLength > 0): + + # These are the main sections of the ENA submission. + documentBuffer += self.printHeader() + documentBuffer += self.printMRNA() + documentBuffer += self.printCDS() + documentBuffer += self.printFeatures() + documentBuffer += self.printSequence() + + # Print entry terminator. The last line of an ENA entry. + documentBuffer += ('//\n') + + else: + tkMessageBox.showinfo('No HLA Sequence Found', + 'The HLA sequence is empty.\nPlease fill in an annotated HLA sequence\nbefore generating the submission.' ) + + pass + return documentBuffer diff --git a/src/AlleleGui.py b/src/AlleleGui.py index 7c94324..1a99985 100755 --- a/src/AlleleGui.py +++ b/src/AlleleGui.py @@ -14,7 +14,7 @@ # along with EMBL-HLA-Submission. If not, see . # Version 1.0 -SoftwareVersion = "EMBL-HLA-Submission Version 1.0" +SoftwareVersion = "Bhast Version 1.0" import os @@ -30,42 +30,42 @@ class AlleleGui(Tkinter.Frame): # Initialize the GUI def __init__(self, root): Tkinter.Frame.__init__(self, root) - root.title("EMBL Novel HLA Allele Submission Tool") + root.title("Bhast - A Novel HLA Allele Submission Generator") self.parent = root + + # Ctrl-A doesn't work by default in TK. I guess I need to do it myself. + root.bind_class("Text","", self.selectall) + self.initialize() - + + # I shouldn't need to write a select-All method but TK is kind of annoying. + def selectall(self, event): + + event.widget.tag_add("sel","1.0","end") + # Initialize GUI elements def initialize(self): button_opt = {'fill': Tkconstants.BOTH, 'padx': 35, 'pady': 5} - self.cellNumInstrText = Tkinter.StringVar() self.cellNumInstrText.set('Sample ID:') self.inputCellNummer = Tkinter.StringVar() - self.inputCellNummer.set('11111') self.geneInstrText = Tkinter.StringVar() self.geneInstrText.set('Gene:') self.inputGene = Tkinter.StringVar() - self.inputGene.set('HLA-C') self.alleleInstrText = Tkinter.StringVar() self.alleleInstrText.set('Allele:') self.inputAllele = Tkinter.StringVar() - self.inputAllele.set('C0213ext') - #self.inputFeature = Tkinter.StringVar() - #self.inputFeature.set('AGC[AGT]CCG[GGC]AAT') self.featureInstrText = Tkinter.StringVar() self.featureInstrText.set('Annotated Sequence:') self.outputEMBLSubmission = Tkinter.StringVar() self.outputEMBLSubmission.set('Resulting Allele Submission:') - #Moving this to the bottom - #Tkinter.Label(self, width=85, height=3, textvariable=self.instructionText).pack() - Tkinter.Label(self, width=80, height=1, textvariable=self.cellNumInstrText).pack() Tkinter.Entry(self, width=15, textvariable=self.inputCellNummer).pack() @@ -98,11 +98,7 @@ def initialize(self): self.featureInputGuiObject.pack() self.featureInputFrame.pack() - self.featureInputGuiObject.delete('1.0','end') - self.featureInputGuiObject.insert('1.0', 'aag\nCGTCGT\nccg\nGGCTGA\naat') - - #Tkinter.Button(self, text='\|/ Generate an EMBL submission \|/', command=self.updateGUI).pack(**button_opt) - Tkinter.Button(self, text=unichr(8681) + ' Generate an EMBL submission ' + unichr(8681), command=self.updateGUI).pack(**button_opt) + Tkinter.Button(self, text=unichr(8681) + ' Generate an EMBL submission ' + unichr(8681), command=self.constructSubmission).pack(**button_opt) Tkinter.Label(self, width=80, height=1, textvariable=self.outputEMBLSubmission).pack() @@ -136,56 +132,61 @@ def initialize(self): Tkinter.Button(self, text='Save this submission to my computer', command=self.saveSubmissionFile).pack(**button_opt) self.instructionText = Tkinter.StringVar() - #self.instructionText.set('This tool assumes you are submitting a standard HLA allele.\n' - # + 'HLA alleles are assumed to be fully sequenced, including 5\' and 3\' UTRs.\n' - # + 'Use capital letters for exons, lowercase for introns & UTRs, like this:\n' - # + 'five\'utr EXON1 intron1 EXON2 ... EXON{X} three\'utr\n' - # + 'All spaces, tabs, and newlines are discarded and ignored.') self.instructionText.set('This tool was developed by the Tissue Typing Laboratory at\nMaastricht University Medical Center.\nFor more information:') Tkinter.Label(self, width=85, height=3, textvariable=self.instructionText).pack() - - + # Make a frame for the more-info buttons self.moreInfoFrame = Tkinter.Frame(self) Tkinter.Button(self.moreInfoFrame, text='How to use this tool', command=self.howToUse).grid(row=0, column=0) Tkinter.Button(self.moreInfoFrame, text='Contacting or Citing MUMC', command=self.contactInformation).grid(row=0, column=1) + Tkinter.Button(self.moreInfoFrame, text='Example Sequence', command=self.sampleSequence).grid(row=0, column=2) - - self.moreInfoFrame.pack() - - self.updateGUI() - def howToUse(self): - # This method should popup some instruction text in a wee window. + def sampleSequence(self): + self.featureInputGuiObject.delete('1.0','end') + self.featureInputGuiObject.insert('1.0', 'aag\nCGTCGT\nccg\nGGCTGA\naat') + + self.inputAllele.set('Allele:01:02') + self.inputGene.set('HLA-C') + self.inputCellNummer.set('Donor_12345') - #self.instructionText.set('This tool assumes you are submitting a standard HLA allele.\n' - # + 'HLA alleles are assumed to be fully sequenced, including 5\' and 3\' UTRs.\n' - # + 'Use capital letters for exons, lowercase for introns & UTRs, like this:\n' - # + 'five\'utr EXON1 intron1 EXON2 ... EXON{X} three\'utr\n' - # + 'All spaces, tabs, and newlines are discarded and ignored.') + self.constructSubmission() + # This method should popup some instruction text in a wee window. + # This should be explicit on how to use the tool. + def howToUse(self): tkMessageBox.showinfo('How to use this tool', 'This software is to be used to create an\n' + 'EMBL-formatted submission document,\n' - + 'which specifies a novel HLA allele,\n' - + 'including exon/intron annotation.\n\n' + + 'which specifies a (novel) HLA allele.\n\n' - + 'This tool assumes you are submitting a\n' - + 'full length HLA allele.\n' - + 'HLA alleles should be fully sequenced,\n' - + 'including 5\' and 3\' UTRs.\n' + + 'This tool requires you to submit a\n' + + 'full length HLA allele, including\n' + + '5\' and 3\' UTRs.\n\n' + + 'Use capital letters for exons,\n' + 'lowercase for introns & UTRs.\n\n' - + 'An example is included in the form,\n' + + 'Push the "Example Sequence" button to see a small example of' + + ' a formatted sequence.\n' + 'Sequences should follow this pattern:\n' + '5\'utr EX1 int1 EX2 ... EX{X} 3\'utr\n\n' - + 'All spaces, tabs, and newlines are\n' - + 'removed and ignored.' + + 'To use this tool:\n' + + '1.) Fill in a Sample ID, Gene Name, and Allele.' + + ' This text will be included in the submission.\n' + + '2.) Paste your formatted sequence in the\n' + + 'Annotated Sequence text area.\n' + + '3.) Push \"Generate an EMBL submission\" button' + + ' to generate a submission.\n' + + '4.) Push the "Save the submission" button' + + ' to store the submission on your computer.\nYou can submit this file to EMBL.\n\n' + + + 'All spaces, tabs, and newlines are' + + ' removed before the nucleotide sequence is translated.' ) def contactInformation(self): @@ -221,12 +222,13 @@ def saveSubmissionFile(self): options['initialdir'] = self.idir options['parent'] = self options['title'] = 'Specify your output file.' + options['initialfile'] = 'NovelAlleleEMBLSubmission.txt' outputFileObject = tkFileDialog.asksaveasfile(**self.dir_opt) submissionText = self.submOutputGuiObject.get('1.0', 'end') outputFileObject.write(submissionText) # Gather sequence information from the input elements, and generate a text EMBL submission. - def updateGUI(self): + def constructSubmission(self): allGen = AlleleGenerator() roughFeatureSequence = self.featureInputGuiObject.get('1.0', 'end') diff --git a/src/AlleleSubmissionEMBL.py b/src/AlleleSubmissionEMBL.py index 1ffa0b5..6070ddb 100755 --- a/src/AlleleSubmissionEMBL.py +++ b/src/AlleleSubmissionEMBL.py @@ -15,7 +15,7 @@ # Version 1.0 -SoftwareVersion = "EMBL-HLA-Submission Version 1.0" +SoftwareVersion = "Bhast Version 1.0" import Tkinter import sys @@ -24,9 +24,9 @@ if __name__=='__main__': try: - # This is a really simple way to read commandline args, # because there really shouldn't be any. + # TODO: Be more graceful with this, there are better ways to read args. # No parameters are expected at all. sys.argv[0] doesn't count. if (len(sys.argv) == 1): @@ -36,7 +36,7 @@ AlleleGui(root).pack() root.mainloop() - print('Done. Yay.') + print('Done. Hooray.') # Print the Software Version elif (len(sys.argv) == 2 and ( diff --git a/src/HLAGene.py b/src/HLAGene.py index 9b2b7ca..69565e3 100755 --- a/src/HLAGene.py +++ b/src/HLAGene.py @@ -17,13 +17,14 @@ # The GeneLocus class specifies a locus on a Gene, # Either an Exon, intron, or UTR. -class GeneLocus(): - - name = '' - sequence = '' - exon = False - beginIndex = 0 - endIndex = 0 +class GeneLocus(): + + def __init__(self): + self.name = '' + self.sequence = '' + self.exon = False + self.beginIndex = 0 + self.endIndex = 0 def length(self): return 1 + self.endIndex - self.beginIndex @@ -31,16 +32,15 @@ def length(self): # The Gene class represents an entire HLA Gene, consisting of a series of loci. class HLAGene(): - fullSequence = '' - loci = [] + def __init__(self): + self.fullSequence = '' + self.loci = [] def totalLength(self): - return len(self.getCompleteSequence()) # Combine the UTRs, Exons, and Introns into a contiguous sequence. def getCompleteSequence(self): - sequence='' for i in range(0, len(self.loci)): sequence += self.loci[i].sequence @@ -50,7 +50,7 @@ def getCompleteSequence(self): def getExonSequence(self): sequence='' - for i in range(1, len(self.loci)-1): + for i in range(0, len(self.loci)): if(self.loci[i].exon): sequence += self.loci[i].sequence return sequence @@ -58,38 +58,50 @@ def getExonSequence(self): # This method names the UTRs, Exons, and Introns, and records their indices. # A HLA gene is always expected to have the pattern # # 5UT -> EX1 -> IN1 -> EX2 -> IN2 -> ... -> EXN -> 3UT + # But I can't assume this so I will check. def annotateLoci(self): print('Annotating Gene Now') + # An index for the nucleotide position lociBeginIndex = 1 - if(len(self.loci) > 2): - for x in range(0, len(self.loci)): - - # Determine the name of this loci. - # 5UT -> EX1 -> IN1 -> EX2 -> IN2 -> ... -> EXN -> 3UT - if(x==0): - self.loci[x].name = '5UT' - elif(x==len(self.loci)-1): - self.loci[x].name = '3UT' - elif(x%2 == 1): - self.loci[x].name = 'EX' + str(x/2 + 1) - else: - self.loci[x].name = 'I' + str(x/2) + + # Start with the names 'EX1' and 'IN1' + exonIndex = 1 + intronIndex = 1 + + lociCount = len(self.loci) + + if(lociCount == 0): + print('No loci to annotate. We are done here.') + # TODO: do I need to raise an exception or just ignore the problem? + + else: + for lociIndex in range(0, len(self.loci)): + + if (self.loci[lociIndex].exon): + self.loci[lociIndex].name = 'EX' + str(exonIndex) + exonIndex += 1 + else : + # Is it a UTR or an Intron? + if (lociIndex == 0): + self.loci[lociIndex].name = '5UT' + elif (lociIndex == lociCount - 1): + self.loci[lociIndex].name = '3UT' + else: + self.loci[lociIndex].name = 'I' + str(intronIndex) + intronIndex += 1 # Determine start and end indices of these exons. # Attempting to make index that looks like: #5UT: 1-65 #EX1: 66-137 # I1: 138-267 - self.loci[x].beginIndex = lociBeginIndex - lociBeginIndex += len(self.loci[x].sequence) - self.loci[x].endIndex = lociBeginIndex - 1 - - - else: - print('I expected at least three loci in order to annotate them. Please double check your input file.') - + self.loci[lociIndex].beginIndex = lociBeginIndex + lociBeginIndex += len(self.loci[lociIndex].sequence) + self.loci[lociIndex].endIndex = lociBeginIndex - 1 + + # Print a summary of the inputted sequence to console. def printGeneSummary(self): print('\nPrinting Gene Summary')