From 142810e108f374840ff8a6d9b002441b6232b0a7 Mon Sep 17 00:00:00 2001
From: Ben <ben.matern@gmail.com>
Date: Fri, 24 Mar 2017 11:14:03 +0100
Subject: [PATCH 1/3] changed the error messages

---
 src/AlleleGenerator.py      | 542 ++++++++++++++++++++----------------
 src/AlleleGui.py            |  23 +-
 src/AlleleSubmissionEMBL.py |   2 +-
 src/HLAGene.py              |  78 +++---
 4 files changed, 371 insertions(+), 274 deletions(-)

diff --git a/src/AlleleGenerator.py b/src/AlleleGenerator.py
index 410719b..cdba97b 100755
--- a/src/AlleleGenerator.py
+++ b/src/AlleleGenerator.py
@@ -17,6 +17,7 @@
 
 from Bio.Seq import Seq
 from Bio.Alphabet import generic_dna
+import sys
 import tkMessageBox
 
 import math
@@ -26,63 +27,73 @@
 # The AlleleGenerator class contains logic to generate an EMBL HLA allele submission 
 # In ENA format.  
 class AlleleGenerator():   
+    
+    def __init__(self):
  
-    inputFileName = ''
-    outputFileName = ''
-    sequenceAnnotation = HLAGene()
-    inputCellNummer = 0#12345
-    inputGene = ''#HLA-C'
-    inputAllele = ''#C0316ext'  
+        self.inputFileName = ''
+        self.outputFileName = ''
+        self.sequenceAnnotation = HLAGene()
+        self.inputCellNummer = 0#12345
+        self.inputGene = ''#HLA-C'
+        self.inputAllele = ''#C0316ext'  
 
     # This is a short wrapper method to use biopython's translation method. 
     def translateSequence(self,inputSequence):
 
-        coding_dna = Seq(inputSequence, generic_dna)
-        
-        peptideSequence = str(coding_dna.translate())
-        print ('Translated Protein:' + peptideSequence)
-        
-        #Stop codon *should* be at the end of the protein.  
-        stopCodonLocation = peptideSequence.find('*')
+        try:
+            coding_dna = Seq(inputSequence, generic_dna)        
+            proteinSequence = str(coding_dna.translate())   
+            print ('Exon Sequence before translation:' + coding_dna)     
+            print ('Translated Protein:' + proteinSequence)
+            
+            # Perform Sanity Checks.
+            # Stop codon *should* be at the end of the protein.  
+            stopCodonLocation = proteinSequence.find('*')
+            
+            # If no stop codon was found
+            if (stopCodonLocation == -1):
+                if(len(coding_dna) % 3 == 0):
+                    tkMessageBox.showinfo('No Stop Codon Found', 
+                        'The translated protein does not contain a stop codon.' )
+                else:
+                    tkMessageBox.showinfo('No Stop Codon Found', 
+                        'The translated protein does not contain a stop codon.\n' + 
+                        'The coding nucleotide sequence length (' + str(len(coding_dna))  + ') is not a multiple of 3.')
+            
+            # This happens if the coding sequence is the wrong length
+            elif (not len(coding_dna) % 3 == 0):
+                tkMessageBox.showinfo('Check your coding sequence length.', 
+                    'The coding nucleotide sequence length (' + str(len(coding_dna))  + ') is not a multiple of 3.')
         
-        if (stopCodonLocation == -1):
-            if(len(coding_dna) % 3 == 0):
-                tkMessageBox.showinfo('No Stop Codon Found', 
-                    'The translated protein does not contain a stop codon.' )
+            # This is normal and expected.
             else:
-                tkMessageBox.showinfo('No Stop Codon Found', 
-                    'The translated protein does not contain a stop codon.\n' + 
-                    'It looks like a frame shift,\n' +
-                    'The coding nucleotide sequence has length: ' + str(len(coding_dna)) + '.')
-        else:
-            if (stopCodonLocation == len(peptideSequence) - 1):
-                #Stop codon is the last character in the peptide sequence.  That's just fine, but trim off the stop codon.
-                peptideSequence = peptideSequence[0:stopCodonLocation]
-                pass
-            else:
-                tkMessageBox.showinfo('Premature Stop Codon Detected',
-                    'Premature stop codon found:\nPeptide Position (' + 
-                    str(stopCodonLocation + 1) + '/' +
-                    str(len(peptideSequence)) + ')\n\n' +
-                    'Double check your peptide sequence,\n' + 
-                    'Some aminos from the 3\' / C-Terminus\nwere spliced out.\n\n' + 
-                    'Before : ' + peptideSequence + 
-                    '\nAfter  : ' + peptideSequence[0:stopCodonLocation] + 
-                    '\n'
-                    )
-                peptideSequence = peptideSequence[0:stopCodonLocation]
-
-        return peptideSequence
-
-    # 
-    def readInputSequence(self):
-
-        print('Reading file: ' + self.inputFileName)
-
-        fileObject = open(self.inputFileName, 'r')        
-        fullFile = fileObject.read()
-
-        self.processInputSequence(fullFile)        
+                if (stopCodonLocation == len(proteinSequence) - 1):
+                    # Stop codon is the last character in the peptide sequence.  
+                    # That's just fine, but trim off the stop codon.
+                    proteinSequence = proteinSequence[0:stopCodonLocation]
+                    pass
+                else:
+                    tkMessageBox.showinfo('Premature Stop Codon Detected',
+                        'Premature stop codon found:\nProtein Position (' + 
+                        str(stopCodonLocation + 1) + '/' +
+                        str(len(proteinSequence)) + ')\n\n' +
+                        'Double check your protein sequence,\n' + 
+                        'because some aminos from the\n3\' / C-Terminus end\nwere spliced out.\n\n' + 
+                        'Protein Before Splicing:\n' + proteinSequence + 
+                        '\n\nProtein After Splicing:\n' + proteinSequence[0:stopCodonLocation] + 
+                        '\n'
+                        )
+                    proteinSequence = proteinSequence[0:stopCodonLocation]
+    
+            return proteinSequence
+        
+        except Exception:
+            print 'Problem when translating protein:'
+            print sys.exc_info()[1]
+            tkMessageBox.showinfo('Protein Translation Error', 
+                'I could not translate your protein:\n' +  str(sys.exc_info()[1]))
+            
+            raise
 
     # The input file should be a string of nucleotides, with capital letters to identify exons and introns.
     # Annotations are expected and read in this format:
@@ -92,268 +103,327 @@ def readInputSequence(self):
     def processInputSequence(self, inputSequenceText):
 
         resultGeneLoci = HLAGene()
-        # Why do I need to initialize loci array?  I would have thought calling Gene() would do it.  
-        resultGeneLoci.loci = []
-
+        
         # Trim out any spaces, tabs, newlines.  Uppercase.
         cleanedGene = inputSequenceText.replace(' ','').replace('\n','').replace('\t','').replace('\r','')
 
-        # Trim out the annotation marks, and capitalize, so I have a copy of the full sequence.
+        # Capitalize, so I can store a copy of the full unannotated sequence.
         unannotatedGene = cleanedGene.upper()
         resultGeneLoci.fullSequence = unannotatedGene
         print('Total Sequence Length = ' + str(len(unannotatedGene)))
 
         # Loop through the cleaned and annotated input sequence, 
-        # to search for exon annotation characters ( '[' and ']' )
-        # I assume that the first and last loci are the 5' and 3' UTR, 
-        # I assume that Exons and Introns will alternate beyond that.
-        # It no longer uses ( '[' and ']' ) to specify exons.  I check for 
         # capitals and lowercase letters to determine exon start and end
-        insideAnExon = False
-        locusBeginPosition = 0
-        for x in range(0, len(cleanedGene)):
-            currentChar = cleanedGene[x]
+        if(len(cleanedGene) > 0):
             
-            # Is this a standard nucleotide character?
-            if(currentChar.upper() in ('A','G','C','T')):
-
-                if(currentChar.isupper()):
-                    if(insideAnExon):
-                        #We're STILL in an exon.  In this case, I should just do nothing and continue.  
-                        pass
+            # Is the first feature an exon or an intron?
+            # If we begin in an Exon
+            if( cleanedGene[0] in ('A','G','C','T')):                
+                insideAnExon = True
+            # If we begin in an Intron/UTR
+            elif( cleanedGene[0] in ('a','g','c','t')):  
+                insideAnExon = False
+            else:
+                # Nonstandard nucleotide? I should start panicking.
+                #raise Exception('Nonstandard Nucleotide, not sure how to handle it')
+                print('Nonstandard Nucleotide at the beginning of the sequence, not sure how to handle it')
+                insideAnExon = False
+            
+            
+            locusBeginPosition = 0
+            for x in range(0, len(cleanedGene)):
+                currentChar = cleanedGene[x]
+                
+                # Is this a standard nucleotide character?
+                if(currentChar.upper() in ('A','G','C','T')):
+    
+                    if(currentChar.isupper()):
+                        if(insideAnExon):
+                            #We're STILL in an exon.  In this case, I should just do nothing and continue.  
+                            pass
+                        else:
+                            #In this case, we're just starting an EXON.
+                            #Store the last Intron in the list.
+                            currentIntron = GeneLocus()
+                            currentIntron.sequence = cleanedGene[locusBeginPosition:x].upper()
+                            currentIntron.exon = False
+                            resultGeneLoci.loci.append(currentIntron)                    
+                            insideAnExon=True
+                            locusBeginPosition = x
+                            pass
+                            
                     else:
-                        #In this case, we're just starting an EXON.
-                        #Store the last Intron in the list.
-                        currentIntron = GeneLocus()
-                        currentIntron.sequence = cleanedGene[locusBeginPosition:x].upper()
-                        currentIntron.exon = False
-                        resultGeneLoci.loci.append(currentIntron)                    
-                        insideAnExon=True
-                        locusBeginPosition = x
-                        pass
-                        
+                        if not (insideAnExon):
+                            #We're STILL in an intron.  Continue.
+                            pass
+                        else:
+                            #Starting a new Intron.
+                            # Store an Exon in the list.
+                            currentExon = GeneLocus()
+                            currentExon.sequence = cleanedGene[locusBeginPosition:x].upper()
+                            currentExon.exon = True
+                            resultGeneLoci.loci.append(currentExon)     
+                            insideAnExon = False
+                            locusBeginPosition=x
+                            pass
                 else:
-                    if not (insideAnExon):
-                        #We're STILL in an intron.  Continue.
-                        pass
-                    else:
-                        # Store an Exon in the list.
-                        currentExon = GeneLocus()
-                        currentExon.sequence = cleanedGene[locusBeginPosition:x].upper()
-                        currentExon.exon = True
-                        resultGeneLoci.loci.append(currentExon)     
-                        insideAnExon = False
-                        locusBeginPosition=x
-
-                        #Starting a new Intron.
-                        pass
-            else:
-                print('Nonstandard nucleotide detected at position ' + str(x) + ' : ' + currentChar 
-                    + '.  If this is a wildcard character, you might be ok.')
-
-        # Store the last(3') UTR as an intron.
-        currentIntron = GeneLocus()
-        currentIntron.sequence = cleanedGene[locusBeginPosition:len(cleanedGene)].upper()
-        currentIntron.exon = False
-        resultGeneLoci.loci.append(currentIntron)    
-
-        # Annotate the loci (name them) and print the results of the read file.
-        resultGeneLoci.annotateLoci()
-        resultGeneLoci.printGeneSummary()
-
-        self.sequenceAnnotation = resultGeneLoci
-
-    # Create the text submission based on the ENA format.
-    def buildENASubmission(self):    
+                    print('Nonstandard nucleotide detected at position ' + str(x) + ' : ' + currentChar 
+                        + '.  If this is a wildcard character, you might be ok.')
+    
+            # We've reached the end of the loop and we still need to store the last feature.
+            # Should be a 3' UTR, but I can't be sure, people like to put in weird sequences.
+            currentIntron = GeneLocus()
+            currentIntron.sequence = cleanedGene[locusBeginPosition:len(cleanedGene)].upper()
+            currentIntron.exon = insideAnExon
+            resultGeneLoci.loci.append(currentIntron)    
+    
+            # Annotate the loci (name them) and print the results of the read file.
+            resultGeneLoci.annotateLoci()
+            resultGeneLoci.printGeneSummary()
         
-        # ENA format is the preferred submission type for EMBL.  More information:
-        # http://www.ebi.ac.uk/ena/submit/sequence-submission
-        # http://www.ebi.ac.uk/ena/submit/entry-upload-templates
-        # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt
-        # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
-        # http://www.ebi.ac.uk/ena/software/flat-file-validator
-
-        documentBuffer = ''
-
-        # These variables are for test data, they should be filled in by GUI.
-        #self.inputCellNummer = 23445
-        #self.inputGene = 'HLA-C'
-        #self.inputAllele = 'C0316ext'        
+        # If the sequence is empty
+        else:
+            print('Empty sequence, I don\'t have anything to do.')
+            
+        self.sequenceAnnotation = resultGeneLoci
 
-        completeSequence = self.sequenceAnnotation.getCompleteSequence()
-        exonSequence = self.sequenceAnnotation.getExonSequence()
-        totalLength = self.sequenceAnnotation.totalLength()
-        featureCount = len(self.sequenceAnnotation.loci)
-        print('total calculated length = ' + str(totalLength))
+    
 
+    def printHeader(self):
+        headerText = ''
+        
         # Print header
-        documentBuffer += ('ID   XXX; XXX; linear; genomic DNA; XXX; XXX; ' + str(totalLength) + ' BP.\n')
-        documentBuffer += ('XX\n')
-
+        headerText += 'ID   XXX; XXX; linear; genomic DNA; XXX; XXX; ' + str(self.sequenceAnnotation.totalLength()) + ' BP.\n'
+        headerText += 'XX\n'
         # A valid document should have an AC (Accession Number) and DE (Description) field.
         # I don't have an AC number available, so it's blank.
-        documentBuffer += ('AC   \n')
-        documentBuffer += ('XX\n')
-
-        documentBuffer += ('DE   Human Leukocyte Antigen\n')
-        documentBuffer += ('XX\n')
+        headerText += 'AC   \n'
+        headerText += 'XX\n'
+        headerText += 'DE   Human Leukocyte Antigen\n'
+        headerText += 'XX\n'
 
         # Print key
-        documentBuffer += ('FH   Key             Location/Qualifiers\n')
-        documentBuffer += ('FH\n')
-
+        headerText += ('FH   Key             Location/Qualifiers\n')
+        headerText += ('FH\n')
+        
         # Print source
         # It's from a human.
-        documentBuffer += ('FT   source          1..' + str(totalLength) + '\n')
-        documentBuffer += ('FT                   /organism="Homo sapiens"\n')
-        documentBuffer += ('FT                   /db_xref="taxon:9606"\n')
-        documentBuffer += ('FT                   /mol_type="genomic DNA"\n')
-        documentBuffer += ('FT                   /chromosome="6"\n')
-        documentBuffer += ('FT                   /isolate="' + str(self.inputCellNummer) + '"\n')    
-
+        headerText += ('FT   source          1..' + str(self.sequenceAnnotation.totalLength()) + '\n')
+        headerText += ('FT                   /organism="Homo sapiens"\n')
+        headerText += ('FT                   /db_xref="taxon:9606"\n')
+        headerText += ('FT                   /mol_type="genomic DNA"\n')
+        headerText += ('FT                   /chromosome="6"\n')
+        headerText += ('FT                   /isolate="' + str(self.inputCellNummer) + '"\n')    
+        
+        return headerText
+    
+    def printMRNA(self):
+        mRNAText = ''
         # Print mRNA
-        documentBuffer += ('FT   mRNA            join(')
+        mRNAText += ('FT   mRNA            join(')
+        
         # Iterate through the indices of the UTRs and exons.
         # The 3' and 5' UTR are included in the mRNA
-        # But not in the CDS (coding sequence), since they're untranslated.
-        documentBuffer += (str(self.sequenceAnnotation.loci[0].beginIndex) 
-            + '..' + str(self.sequenceAnnotation.loci[0].endIndex) + ',')
-
-        for x in range(1,featureCount-1):
+        for x in range(0,len(self.sequenceAnnotation.loci)):
             geneLocus = self.sequenceAnnotation.loci[x]
-            if (geneLocus.exon):
-                documentBuffer += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex) + ','
+            # If it is an exon or UTR
+            if (geneLocus.exon or 'UT' in geneLocus.name):
+                mRNAText += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex) + ','
 
-        documentBuffer += (str(self.sequenceAnnotation.loci[featureCount-1].beginIndex) 
-            + '..' + str(self.sequenceAnnotation.loci[featureCount-1].endIndex) + ')\n')
+        # Trim off the last comma and add a parenthese
+        mRNAText = mRNAText[0:len(mRNAText)-1] + ')\n'
 
-        documentBuffer += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
-        documentBuffer += ('FT                   /allele="' + str(self.inputAllele) + '"\n')  
-        documentBuffer += ('FT                   /product=\"MHC class I antigen\"\n')  
-                
+        mRNAText += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
+        mRNAText += ('FT                   /allele="' + str(self.inputAllele) + '"\n')  
+        mRNAText += ('FT                   /product=\"MHC class I antigen\"\n')  
+        
+        return mRNAText
+    
+    
+    def printCDS(self):
+        cdsText = ''
+        
         # Print CDS
         # CDS is the coding sequence.  It should include the exons, but not the UTRs/Introns
         # The range 1:featureCount-1 will exclude the UTRs.
-        documentBuffer += ('FT   CDS             join(') 
-        for x in range(1,featureCount-1):
+        cdsText += ('FT   CDS             join(') 
+        for x in range(0,len(self.sequenceAnnotation.loci)):
             geneLocus = self.sequenceAnnotation.loci[x]
             if (geneLocus.exon):
-                documentBuffer += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex)
-                if not x==featureCount-2:
-                    documentBuffer += ','
+                cdsText += str(geneLocus.beginIndex) + '..' + str(geneLocus.endIndex)
+                if not x==len(self.sequenceAnnotation.loci)-2:
+                    cdsText += ','
                 else:
-                    documentBuffer += ')\n'
+                    cdsText += ')\n'
 
-        documentBuffer += ('FT                   /transl_table=1\n')
-        documentBuffer += ('FT                   /codon_start=1\n')
-        documentBuffer += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
-        documentBuffer += ('FT                   /allele="' + str(self.inputAllele) + '"\n')  
-        documentBuffer += ('FT                   /product=\"MHC class I antigen\"\n')  
-        documentBuffer += ('FT                   /translation=\"')
+        cdsText += ('FT                   /transl_table=1\n')
+        cdsText += ('FT                   /codon_start=1\n')
+        cdsText += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
+        cdsText += ('FT                   /allele="' + str(self.inputAllele) + '"\n')  
+        cdsText += ('FT                   /product=\"MHC class I antigen\"\n')  
+        cdsText += ('FT                   /translation=\"')
 
         # Some simple formatting for the peptide sequence, making it human and computer readable.  
         # 80 peptides per line.  Except the first line, which is 66.
         # 66 is 80-14, where 14 is the length of { /translation=" }
-        peptideSequence = self.translateSequence(exonSequence)
+        peptideSequence = self.translateSequence(self.sequenceAnnotation.getExonSequence())
         if(len(peptideSequence) < 66):
-            documentBuffer += (peptideSequence) + '\"\n'
+            cdsText += (peptideSequence) + '\"\n'
         else:
-            documentBuffer += peptideSequence[0:66] + '\n'
+            cdsText += peptideSequence[0:66] + '\n'
             i=66
             while (i < len(peptideSequence)):
-                documentBuffer += 'FT                   ' + peptideSequence[i:i+80] + '\n'   
+                cdsText += 'FT                   ' + peptideSequence[i:i+80] + '\n'   
                 i += 80
- 
-        # Print 5'UTR        
-        utr = self.sequenceAnnotation.loci[0]
-        documentBuffer += ('FT   5\'UTR           ' + str(utr.beginIndex) + '..' + str(utr.endIndex) + '\n')
-        documentBuffer += ('FT                   /note=\"5\'UTR\"\n')
-        documentBuffer += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
-        documentBuffer += ('FT                   /allele="' + str(self.inputAllele) + '"\n')  
-
-        # Print alternating Ex/Int/Ex
-        for x in range(1,featureCount-1):
+                
+        return cdsText
+    
+    def printFeatures(self):
+        featureText = ''
+        
+        exonIndex = 1
+        intronIndex = 1
+        
+        geneHas3UTR = False
+        geneHas5UTR = False
+            
+        for x in range(0,len(self.sequenceAnnotation.loci)):
             currentFeature = self.sequenceAnnotation.loci[x]
 
-            if(currentFeature.exon):
-                documentBuffer += ('FT   exon            ' + str(currentFeature.beginIndex) 
+            # 3' UTR
+            if(currentFeature.name == '3UT'):
+                featureText += ('FT   3\'UTR           ' + str(currentFeature.beginIndex) + '..' + str(currentFeature.endIndex) + '\n')
+                featureText += ('FT                   /note=\"3\'UTR\"\n')
+                featureText += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
+                featureText += ('FT                   /allele="' + str(self.inputAllele) + '"\n')
+                geneHas3UTR = True  
+                
+            # 5' UTR
+            elif(currentFeature.name == '5UT'):
+                featureText += ('FT   5\'UTR           ' + str(currentFeature.beginIndex) + '..' + str(currentFeature.endIndex) + '\n')
+                featureText += ('FT                   /note=\"5\'UTR\"\n')
+                featureText += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
+                featureText += ('FT                   /allele="' + str(self.inputAllele) + '"\n')
+                geneHas5UTR = True   
+            
+            # Exon
+            elif(currentFeature.exon):
+                featureText += ('FT   exon            ' + str(currentFeature.beginIndex) 
                     + '..' + str(currentFeature.endIndex) + '\n')
+                featureText += ('FT                   /number=' + str(exonIndex) + '\n') 
+                featureText += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
+                featureText += ('FT                   /allele="' + str(self.inputAllele) + '"\n')  
+                exonIndex += 1
+            
+            # Intron
             else:
-                documentBuffer += ('FT   intron          ' + str(currentFeature.beginIndex) 
-                    + '..' + str(currentFeature.endIndex) + '\n')
-
-            geneNumber = int(math.ceil(x / 2.0))
-            documentBuffer += ('FT                   /number=' + str(geneNumber) + '\n') 
-            documentBuffer += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
-            documentBuffer += ('FT                   /allele="' + str(self.inputAllele) + '"\n')  
+                featureText += ('FT   intron          ' + str(currentFeature.beginIndex) 
+                    + '..' + str(currentFeature.endIndex) + '\n')   
+                featureText += ('FT                   /number=' + str(intronIndex) + '\n') 
+                featureText += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
+                featureText += ('FT                   /allele="' + str(self.inputAllele) + '"\n') 
+                intronIndex += 1
+
+       
+        featureText += ('XX\n')
+        
+        # Do a quick sanity check.  If we are missing either UTR I should warn the user.
+        # But move on with your life, this is not worth getting upset over.
+        if (not geneHas3UTR and not geneHas5UTR):
+            tkMessageBox.showinfo('Missing UTRs', 
+                'This sequence has no 5\' or 3\' UTR.\n\n' + 
+                'Use lowercase nucleotides at the\n' + 
+                'beginning and end of your DNA\n' +
+                'sequence to specify the 5\' and 3\' UTRs.' )
+        elif (not geneHas5UTR):
+            tkMessageBox.showinfo('Missing 5\' UTR', 
+                'This sequence has no 5\' UTR.\n\n' + 
+                'Use lowercase nucleotides at the\n' + 
+                'beginning and end of your DNA\n' +
+                'sequence to specify the 5\' and 3\' UTRs.' )            
+        elif (not geneHas3UTR):
+            tkMessageBox.showinfo('Missing 3\' UTR', 
+                'This sequence has no 3\' UTR.\n\n' + 
+                'Use lowercase nucleotides at the\n' + 
+                'beginning and end of your DNA\n' +
+                'sequence to specify the 5\' and 3\' UTRs.' )    
+        else:
+            print('The UTRs look fine.')
+            pass
+        
+        return featureText
+    
+    def printSequence(self):
+        sequenceText = ''
+ 
+        completeSequence = self.sequenceAnnotation.getCompleteSequence().upper()
                 
+        cCount = completeSequence.count('C')
+        gCount = completeSequence.count('G')
+        tCount = completeSequence.count('T')
+        aCount = completeSequence.count('A')
+        otherCount = self.sequenceAnnotation.totalLength() - (cCount + gCount + tCount + aCount)
 
-        # Print 3'UTR
-        utr = self.sequenceAnnotation.loci[len(self.sequenceAnnotation.loci)-1]
-        documentBuffer += ('FT   3\'UTR           ' + str(utr.beginIndex) + '..' + str(utr.endIndex) + '\n')
-        documentBuffer += ('FT                   /note=\"3\'UTR\"\n')
-        documentBuffer += ('FT                   /gene="' + str(self.inputGene) + '"\n') 
-        documentBuffer += ('FT                   /allele="' + str(self.inputAllele) + '"\n')  
-        documentBuffer += ('XX\n')
-
-        # Print sequence
-        # There's a sweet biopython method which can count the nucleotides.
-        # Bio.Seq.count('A')
-        # I didn't use it. 
-        cCount = 0
-        gCount = 0
-        tCount = 0
-        aCount = 0
-        otherCount = 0
-        for nucleotide in completeSequence:
-            if nucleotide == 'C':
-                cCount+=1
-            elif nucleotide == 'G':
-                gCount+=1
-            elif nucleotide == 'T':
-                tCount+=1
-            elif nucleotide == 'A':
-                aCount+=1
-            else:
-                otherCount+=1
-
-        documentBuffer += ('SQ   Sequence ' + str(totalLength) + ' BP; ' 
+        sequenceText += ('SQ   Sequence ' + str(self.sequenceAnnotation.totalLength()) + ' BP; ' 
             + str(aCount) + ' A; ' + str(cCount) + ' C; ' 
             + str(gCount) + ' G; ' + str(tCount) + ' T; ' 
             + str(otherCount) + ' other;\n')
 
         # Here's some logic to print the sequence information in groups of 10.
         # This format is specified in the User manual specified by EMBL.
-        rowCount = 0
-        columnCount = 0
         currentSeqIndex = 0
 
-        while (currentSeqIndex < totalLength):
+        while (currentSeqIndex < self.sequenceAnnotation.totalLength()):
             # The character code for a sequence region is two blank spaces,
             # followed by three blank spaces, for a total of 5 blanks.
-            documentBuffer += '     '
-            sequenceRow = completeSequence[currentSeqIndex : currentSeqIndex + 60]
+            sequenceText += '     '
+            sequenceRow = self.sequenceAnnotation.getCompleteSequence()[currentSeqIndex : currentSeqIndex + 60]
 
             # A sequenceChunk is 10 nucleotides in this context.
             # Format specifies up to six "chunks" per line.
             for i in range(0,6):
                 sequenceChunk = sequenceRow[i*10 : (i+1)*10]
-                documentBuffer += sequenceChunk + ' '
+                sequenceText += sequenceChunk + ' '
 
             # If line is complete (=60 bp), we can print the nucleotide index and move on to the next row.
             if(len(sequenceRow) == 60):
-                documentBuffer += str(currentSeqIndex + 60) + '\n'
+                sequenceText += str(currentSeqIndex + 60) + '\n'
             # but if line is not complete (this is more likely, and more complicated.)
             else:
                 # Fill with spaces to align the nucleotide indices at the end of the sequence.
                 numberSpaces = 60-len(sequenceRow)
                 for n in range (0, numberSpaces):
-                    documentBuffer += ' '
-                documentBuffer += (str(len(sequenceRow) + currentSeqIndex) + '\n')
+                    sequenceText += ' '
+                sequenceText += (str(len(sequenceRow) + currentSeqIndex) + '\n')
             
             # The next row of the sequence
             currentSeqIndex += 60     
-   
+            
+        return sequenceText
+            
+        
+    # Create the text submission based on the ENA format.
+    def buildENASubmission(self):    
+        
+        # ENA format is the preferred submission type for EMBL.  More information:
+        # http://www.ebi.ac.uk/ena/submit/sequence-submission
+        # http://www.ebi.ac.uk/ena/submit/entry-upload-templates
+        # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt
+        # ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
+        # http://www.ebi.ac.uk/ena/software/flat-file-validator
+
+        documentBuffer = ''
+
+        totalLength = self.sequenceAnnotation.totalLength()
+        print('total calculated length = ' + str(totalLength))
+
+        # These are the main sections of the ENA submission.
+        documentBuffer += self.printHeader()
+        documentBuffer += self.printMRNA()
+        documentBuffer += self.printCDS()
+        documentBuffer += self.printFeatures()
+        documentBuffer += self.printSequence()
+
         # Print entry terminator.  The last line of an ENA entry.
         documentBuffer += ('//\n')
 
diff --git a/src/AlleleGui.py b/src/AlleleGui.py
index 7c94324..0a63a52 100755
--- a/src/AlleleGui.py
+++ b/src/AlleleGui.py
@@ -32,18 +32,26 @@ def __init__(self, root):
         Tkinter.Frame.__init__(self, root)
         root.title("EMBL Novel HLA Allele Submission Tool")
         self.parent = root
+        
+        # Ctrl-A doesn't work by default in TK.  I guess I need to do it myself.
+        root.bind_class("Text","<Control-a>", self.selectall)
+
         self.initialize()
-    
+        
+    # I shouldn't need to write a select-All method but TK is kind of annoying.
+    def selectall(self, event):
+
+        event.widget.tag_add("sel","1.0","end")
+        
     # Initialize GUI elements
     def initialize(self):
 
         button_opt = {'fill': Tkconstants.BOTH, 'padx': 35, 'pady': 5}
 
-
         self.cellNumInstrText = Tkinter.StringVar()
         self.cellNumInstrText.set('Sample ID:')
         self.inputCellNummer = Tkinter.StringVar()
-        self.inputCellNummer.set('11111')
+        self.inputCellNummer.set('Donor_12345')
 
         self.geneInstrText = Tkinter.StringVar()
         self.geneInstrText.set('Gene:')
@@ -53,7 +61,7 @@ def initialize(self):
         self.alleleInstrText = Tkinter.StringVar()
         self.alleleInstrText.set('Allele:')
         self.inputAllele = Tkinter.StringVar()   
-        self.inputAllele.set('C0213ext')     
+        self.inputAllele.set('Allele:01:02')     
 
         #self.inputFeature = Tkinter.StringVar()
         #self.inputFeature.set('AGC[AGT]CCG[GGC]AAT')
@@ -180,6 +188,13 @@ def howToUse(self):
             + 'Use capital letters for exons,\n'
             + 'lowercase for introns & UTRs.\n\n'
             
+            + 'Paste your formatted sequence in the\n'
+            + 'first text area, and push the button\n'
+            + 'to generate a submission.\n\n'     
+            
+            + 'You can copy the submission from the GUI\n'
+            + 'or save it as text to your computer.\n\n'        
+            
             + 'An example is included in the form,\n'
             + 'Sequences should follow this pattern:\n'
             + '5\'utr EX1 int1 EX2 ... EX{X} 3\'utr\n\n'
diff --git a/src/AlleleSubmissionEMBL.py b/src/AlleleSubmissionEMBL.py
index 1ffa0b5..19beb9b 100755
--- a/src/AlleleSubmissionEMBL.py
+++ b/src/AlleleSubmissionEMBL.py
@@ -24,9 +24,9 @@
 
 if __name__=='__main__':
     try:
-
         # This is a really simple way to read commandline args, 
         # because there really shouldn't be any.
+        # TODO: Be more graceful with this, there are better ways to read args.
 
         # No parameters are expected at all.  sys.argv[0] doesn't count.
         if (len(sys.argv) == 1):
diff --git a/src/HLAGene.py b/src/HLAGene.py
index 9b2b7ca..69565e3 100755
--- a/src/HLAGene.py
+++ b/src/HLAGene.py
@@ -17,13 +17,14 @@
 
 # The GeneLocus class specifies a locus on a Gene, 
 # Either an Exon, intron, or UTR.
-class GeneLocus(): 
-  
-    name = ''
-    sequence = ''
-    exon = False
-    beginIndex = 0
-    endIndex = 0
+class GeneLocus():
+    
+    def __init__(self):
+        self.name = ''
+        self.sequence = ''
+        self.exon = False
+        self.beginIndex = 0
+        self.endIndex = 0
 
     def length(self):
         return 1 + self.endIndex - self.beginIndex     
@@ -31,16 +32,15 @@ def length(self):
 # The Gene class represents an entire HLA Gene, consisting of a series of loci.
 class HLAGene():
 
-    fullSequence = ''
-    loci = []
+    def __init__(self):
+        self.fullSequence = ''
+        self.loci = []
 
     def totalLength(self):
-
         return len(self.getCompleteSequence())
 
     # Combine the UTRs, Exons, and Introns into a contiguous sequence.
     def getCompleteSequence(self):
-
         sequence=''
         for i in range(0, len(self.loci)):
             sequence += self.loci[i].sequence
@@ -50,7 +50,7 @@ def getCompleteSequence(self):
     def getExonSequence(self):
 
         sequence=''
-        for i in range(1, len(self.loci)-1):
+        for i in range(0, len(self.loci)):
             if(self.loci[i].exon):
                 sequence += self.loci[i].sequence
         return sequence
@@ -58,38 +58,50 @@ def getExonSequence(self):
     # This method names the UTRs, Exons, and Introns, and records their indices.
     # A HLA gene is always expected to have the pattern
     # # 5UT -> EX1 -> IN1 -> EX2 -> IN2 -> ... -> EXN -> 3UT
+    # But I can't assume this so I will check.
     def annotateLoci(self):
         
         print('Annotating Gene Now')
 
+        # An index for the nucleotide position
         lociBeginIndex = 1
-        if(len(self.loci) > 2):
-            for x in range(0, len(self.loci)):
-
-                # Determine the name of this loci.  
-                # 5UT -> EX1 -> IN1 -> EX2 -> IN2 -> ... -> EXN -> 3UT
-                if(x==0):
-                    self.loci[x].name = '5UT'
-                elif(x==len(self.loci)-1):
-                    self.loci[x].name = '3UT'
-                elif(x%2 == 1):
-                    self.loci[x].name = 'EX' + str(x/2 + 1)
-                else:
-                    self.loci[x].name = 'I' + str(x/2)
+        
+        # Start with the names 'EX1' and 'IN1'        
+        exonIndex = 1
+        intronIndex = 1
+        
+        lociCount = len(self.loci)
+        
+        if(lociCount == 0):
+            print('No loci to annotate.  We are done here.')
+            # TODO: do I need to raise an exception or just ignore the problem?
+            
+        else:
+            for lociIndex in range(0, len(self.loci)):
+
+                if (self.loci[lociIndex].exon):
+                    self.loci[lociIndex].name = 'EX' + str(exonIndex)
+                    exonIndex += 1
+                else :
+                    # Is it a UTR or an Intron?
+                    if (lociIndex == 0):
+                        self.loci[lociIndex].name = '5UT'
+                    elif (lociIndex == lociCount - 1):
+                        self.loci[lociIndex].name = '3UT'
+                    else:
+                        self.loci[lociIndex].name = 'I' + str(intronIndex)
+                        intronIndex += 1
 
                 # Determine start and end indices of these exons.
                 # Attempting to make index that looks like:
                 #5UT:	1-65
                 #EX1:	66-137
                 # I1:	138-267
-                self.loci[x].beginIndex = lociBeginIndex
-                lociBeginIndex += len(self.loci[x].sequence)
-                self.loci[x].endIndex = lociBeginIndex - 1
-                
-
-        else:
-            print('I expected at least three loci in order to annotate them.  Please double check your input file.')
-
+                self.loci[lociIndex].beginIndex = lociBeginIndex
+                lociBeginIndex += len(self.loci[lociIndex].sequence)
+                self.loci[lociIndex].endIndex = lociBeginIndex - 1
+            
+            
     # Print a summary of the inputted sequence to console.  
     def printGeneSummary(self):
         print('\nPrinting Gene Summary')

From 1f49a1ce73edd520743fd7349123856c51e4eea9 Mon Sep 17 00:00:00 2001
From: Ben <ben.matern@gmail.com>
Date: Tue, 23 May 2017 16:56:15 +0200
Subject: [PATCH 2/3] Interface looks nice.

---
 src/AlleleGenerator.py      | 156 +++++++++++++++++++++++-------------
 src/AlleleGui.py            |  85 +++++++++-----------
 src/AlleleSubmissionEMBL.py |   2 +-
 3 files changed, 138 insertions(+), 105 deletions(-)

diff --git a/src/AlleleGenerator.py b/src/AlleleGenerator.py
index cdba97b..88a1604 100755
--- a/src/AlleleGenerator.py
+++ b/src/AlleleGenerator.py
@@ -33,58 +33,95 @@ def __init__(self):
         self.inputFileName = ''
         self.outputFileName = ''
         self.sequenceAnnotation = HLAGene()
-        self.inputCellNummer = 0#12345
-        self.inputGene = ''#HLA-C'
-        self.inputAllele = ''#C0316ext'  
+        self.inputCellNummer = 0
+        self.inputGene = ''
+        self.inputAllele = '' 
 
     # This is a short wrapper method to use biopython's translation method. 
+    # Most of this code is just checking for things that went wrong
     def translateSequence(self,inputSequence):
 
-        try:
-            coding_dna = Seq(inputSequence, generic_dna)        
-            proteinSequence = str(coding_dna.translate())   
-            print ('Exon Sequence before translation:' + coding_dna)     
-            print ('Translated Protein:' + proteinSequence)
-            
-            # Perform Sanity Checks.
-            # Stop codon *should* be at the end of the protein.  
-            stopCodonLocation = proteinSequence.find('*')
-            
-            # If no stop codon was found
-            if (stopCodonLocation == -1):
-                if(len(coding_dna) % 3 == 0):
-                    tkMessageBox.showinfo('No Stop Codon Found', 
-                        'The translated protein does not contain a stop codon.' )
-                else:
-                    tkMessageBox.showinfo('No Stop Codon Found', 
-                        'The translated protein does not contain a stop codon.\n' + 
-                        'The coding nucleotide sequence length (' + str(len(coding_dna))  + ') is not a multiple of 3.')
-            
-            # This happens if the coding sequence is the wrong length
-            elif (not len(coding_dna) % 3 == 0):
-                tkMessageBox.showinfo('Check your coding sequence length.', 
-                    'The coding nucleotide sequence length (' + str(len(coding_dna))  + ') is not a multiple of 3.')
+        proteinSequence = ''
         
-            # This is normal and expected.
-            else:
-                if (stopCodonLocation == len(proteinSequence) - 1):
-                    # Stop codon is the last character in the peptide sequence.  
-                    # That's just fine, but trim off the stop codon.
-                    proteinSequence = proteinSequence[0:stopCodonLocation]
-                    pass
+        try:
+            # Do nothing if the input sequence is blank.
+            if( len(inputSequence) > 0 ):
+                
+                coding_dna = Seq(inputSequence, generic_dna)        
+                proteinSequence = str(coding_dna.translate())   
+                print ('Exon Sequence before translation:' + coding_dna)     
+                print ('Translated Protein:' + proteinSequence)
+                
+                # Perform Sanity Checks.
+                # Stop codon *should* be at the end of the protein.  
+                # Here we seek out the first instance of a stop codon, 
+                # and remove the peptides afterwards.
+                # because that's what happens in real life.
+                stopCodonLocation = proteinSequence.find('*')
+                
+                # If no stop codon was found
+                if (stopCodonLocation == -1):
+                    # If multiple of three (correct codon length)
+                    if(len(coding_dna) % 3 == 0):
+                        tkMessageBox.showinfo('No Stop Codon Found', 
+                            'The translated protein does not contain a stop codon.' )
+                        
+                    # Wrong Codon Length
+                    else:
+                        tkMessageBox.showinfo('No Stop Codon Found', 
+                            'The translated protein does not contain a stop codon.\n' + 
+                            'The coding nucleotide sequence length (' + str(len(coding_dna))  + ') is not a multiple of 3.')
+
+                # If Stop Codon is in the end of the protein (This is expected and correct)
+                elif (stopCodonLocation == len(proteinSequence) - 1):
+                    # If multiple of three (correct codon length)
+                    if(len(coding_dna) % 3 == 0):
+                        # Everything is fine in this case.  Trim off the stop codon
+                        proteinSequence = proteinSequence[0:stopCodonLocation]
+                        pass 
+                    # Wrong Codon Length
+                    else:
+                        tkMessageBox.showinfo('Extra Nucleotides After the Stop Codon', 
+                            'The stop codon is at the correct position in the protein, but ' + 
+                            'The coding nucleotide sequence length (' + str(len(coding_dna))  + ') is not a multiple of 3.\n\n' +
+                            'Please double check your sequence.')
+                        proteinSequence = proteinSequence[0:stopCodonLocation]
+                                            
+                # Else Stop Codon is premature (before the end of the protein) 
                 else:
-                    tkMessageBox.showinfo('Premature Stop Codon Detected',
-                        'Premature stop codon found:\nProtein Position (' + 
-                        str(stopCodonLocation + 1) + '/' +
-                        str(len(proteinSequence)) + ')\n\n' +
-                        'Double check your protein sequence,\n' + 
-                        'because some aminos from the\n3\' / C-Terminus end\nwere spliced out.\n\n' + 
-                        'Protein Before Splicing:\n' + proteinSequence + 
-                        '\n\nProtein After Splicing:\n' + proteinSequence[0:stopCodonLocation] + 
-                        '\n'
-                        )
-                    proteinSequence = proteinSequence[0:stopCodonLocation]
+                    # If multiple of three (correct codon length)
+                    if(len(coding_dna) % 3 == 0):
+                        tkMessageBox.showinfo('Premature Stop Codon Detected',
+                            'Premature stop codon found:\nProtein Position (' + 
+                            str(stopCodonLocation + 1) + '/' +
+                            str(len(proteinSequence)) + ')\n\n' +
+                            'Double check your protein sequence,\n' + 
+                            'this might indicate a missense mutation.\n\n' + 
+                            'Translated Protein:\n' + proteinSequence + 
+                            '\n\nProtein in EMBL Submission:\n' + proteinSequence[0:stopCodonLocation] + 
+                            '\n'
+                            )
+                        proteinSequence = proteinSequence[0:stopCodonLocation]
     
+    
+                    # Wrong Codon Length
+                    else:
+                        tkMessageBox.showinfo('Premature Stop Codon Detected',
+                            'Premature stop codon found:\nProtein Position (' + 
+                            str(stopCodonLocation + 1) + '/' +
+                            str(len(proteinSequence)) + ')\n\n' +
+                            'Nucleotide count is not a multiple of 3,\n' +
+                            'Double check your protein sequence,\n' + 
+                            'this might indicate a missense mutation.\n\n' + 
+                            'Translated Protein:\n' + proteinSequence + 
+                            '\n\nProtein in EMBL Submission:\n' + proteinSequence[0:stopCodonLocation] + 
+                            '\n'
+                            )
+                        proteinSequence = proteinSequence[0:stopCodonLocation]
+            else:
+                print('Translating a nucleotide sequence of length 0.  That was easy.')
+                pass
+
             return proteinSequence
         
         except Exception:
@@ -416,16 +453,25 @@ def buildENASubmission(self):
 
         totalLength = self.sequenceAnnotation.totalLength()
         print('total calculated length = ' + str(totalLength))
-
-        # These are the main sections of the ENA submission.
-        documentBuffer += self.printHeader()
-        documentBuffer += self.printMRNA()
-        documentBuffer += self.printCDS()
-        documentBuffer += self.printFeatures()
-        documentBuffer += self.printSequence()
-
-        # Print entry terminator.  The last line of an ENA entry.
-        documentBuffer += ('//\n')
+        
+        if(totalLength > 0):
+
+            # These are the main sections of the ENA submission.
+            documentBuffer += self.printHeader()
+            documentBuffer += self.printMRNA()
+            documentBuffer += self.printCDS()
+            documentBuffer += self.printFeatures()
+            documentBuffer += self.printSequence()
+    
+            # Print entry terminator.  The last line of an ENA entry.
+            documentBuffer += ('//\n')
+            
+        else: 
+            tkMessageBox.showinfo('No HLA Sequence Found', 
+                'The HLA sequence is empty.\nPlease fill in an annotated HLA sequence\nbefore generating the submission.' )
+            
+            pass
+        
 
         return documentBuffer
 
diff --git a/src/AlleleGui.py b/src/AlleleGui.py
index 0a63a52..ece5473 100755
--- a/src/AlleleGui.py
+++ b/src/AlleleGui.py
@@ -30,7 +30,7 @@ class AlleleGui(Tkinter.Frame):
     # Initialize the GUI
     def __init__(self, root):
         Tkinter.Frame.__init__(self, root)
-        root.title("EMBL Novel HLA Allele Submission Tool")
+        root.title("AlleleGen - A Novel HLA Allele Submission Generator")
         self.parent = root
         
         # Ctrl-A doesn't work by default in TK.  I guess I need to do it myself.
@@ -51,29 +51,21 @@ def initialize(self):
         self.cellNumInstrText = Tkinter.StringVar()
         self.cellNumInstrText.set('Sample ID:')
         self.inputCellNummer = Tkinter.StringVar()
-        self.inputCellNummer.set('Donor_12345')
 
         self.geneInstrText = Tkinter.StringVar()
         self.geneInstrText.set('Gene:')
         self.inputGene = Tkinter.StringVar()
-        self.inputGene.set('HLA-C')
 
         self.alleleInstrText = Tkinter.StringVar()
         self.alleleInstrText.set('Allele:')
         self.inputAllele = Tkinter.StringVar()   
-        self.inputAllele.set('Allele:01:02')     
 
-        #self.inputFeature = Tkinter.StringVar()
-        #self.inputFeature.set('AGC[AGT]CCG[GGC]AAT')
         self.featureInstrText = Tkinter.StringVar()
         self.featureInstrText.set('Annotated Sequence:')
 
         self.outputEMBLSubmission = Tkinter.StringVar()
         self.outputEMBLSubmission.set('Resulting Allele Submission:')
 
-        #Moving this to the bottom
-        #Tkinter.Label(self, width=85, height=3, textvariable=self.instructionText).pack()
-
         Tkinter.Label(self, width=80, height=1, textvariable=self.cellNumInstrText).pack()
         Tkinter.Entry(self, width=15, textvariable=self.inputCellNummer).pack()
 
@@ -106,11 +98,7 @@ def initialize(self):
         self.featureInputGuiObject.pack() 
         self.featureInputFrame.pack()
 
-        self.featureInputGuiObject.delete('1.0','end')
-        self.featureInputGuiObject.insert('1.0', 'aag\nCGTCGT\nccg\nGGCTGA\naat')
-
-        #Tkinter.Button(self, text='\|/ Generate an EMBL submission \|/', command=self.updateGUI).pack(**button_opt)
-        Tkinter.Button(self, text=unichr(8681) + ' Generate an EMBL submission ' + unichr(8681), command=self.updateGUI).pack(**button_opt)
+        Tkinter.Button(self, text=unichr(8681) + ' Generate an EMBL submission ' + unichr(8681), command=self.constructSubmission).pack(**button_opt)
 
         Tkinter.Label(self, width=80, height=1, textvariable=self.outputEMBLSubmission).pack()
 
@@ -144,63 +132,61 @@ def initialize(self):
         Tkinter.Button(self, text='Save this submission to my computer', command=self.saveSubmissionFile).pack(**button_opt)
          
         self.instructionText = Tkinter.StringVar()       
-        #self.instructionText.set('This tool assumes you are submitting a standard HLA allele.\n'
-        #    + 'HLA alleles are assumed to be fully sequenced, including 5\' and 3\' UTRs.\n'
-        #    + 'Use capital letters for exons, lowercase for introns & UTRs, like this:\n'
-        #    + 'five\'utr EXON1 intron1 EXON2 ... EXON{X} three\'utr\n'
-        #    + 'All spaces, tabs, and newlines are discarded and ignored.')
         self.instructionText.set('This tool was developed by the Tissue Typing Laboratory at\nMaastricht University Medical Center.\nFor more information:')
         Tkinter.Label(self, width=85, height=3, textvariable=self.instructionText).pack()
-  
-  
+    
         # Make a frame for the more-info buttons
         self.moreInfoFrame = Tkinter.Frame(self)
   
         Tkinter.Button(self.moreInfoFrame, text='How to use this tool', command=self.howToUse).grid(row=0, column=0)
         Tkinter.Button(self.moreInfoFrame, text='Contacting or Citing MUMC', command=self.contactInformation).grid(row=0, column=1)
+        Tkinter.Button(self.moreInfoFrame, text='Example Sequence', command=self.sampleSequence).grid(row=0, column=2)
         
-
-
         self.moreInfoFrame.pack()
 
-
-        self.updateGUI()
         
-    def howToUse(self):
-        # This method should popup some instruction text in a wee window.
+    def sampleSequence(self):
+        self.featureInputGuiObject.delete('1.0','end')
+        self.featureInputGuiObject.insert('1.0', 'aag\nCGTCGT\nccg\nGGCTGA\naat')
+        
+        self.inputAllele.set('Allele:01:02')    
+        self.inputGene.set('HLA-C') 
+        self.inputCellNummer.set('Donor_12345')
         
-        #self.instructionText.set('This tool assumes you are submitting a standard HLA allele.\n'
-        #    + 'HLA alleles are assumed to be fully sequenced, including 5\' and 3\' UTRs.\n'
-        #    + 'Use capital letters for exons, lowercase for introns & UTRs, like this:\n'
-        #    + 'five\'utr EXON1 intron1 EXON2 ... EXON{X} three\'utr\n'
-        #    + 'All spaces, tabs, and newlines are discarded and ignored.')
+        self.constructSubmission()
         
+    # This method should popup some instruction text in a wee window.
+    # This should be explicit on how to use the tool.    
+    def howToUse(self):
         tkMessageBox.showinfo('How to use this tool',
             'This software is to be used to create an\n'
             + 'EMBL-formatted submission document,\n'
-            + 'which specifies a novel HLA allele,\n'
-            + 'including exon/intron annotation.\n\n'       
+            + 'which specifies a (novel) HLA allele.\n\n'       
                        
-            + 'This tool assumes you are submitting a\n'
-            + 'full length HLA allele.\n'
-            + 'HLA alleles should be fully sequenced,\n'
-            + 'including 5\' and 3\' UTRs.\n'
+            + 'This tool requires you to submit a\n'
+            + 'full length HLA allele, including\n'
+            + '5\' and 3\' UTRs.\n\n'
+            
             + 'Use capital letters for exons,\n'
             + 'lowercase for introns & UTRs.\n\n'
             
-            + 'Paste your formatted sequence in the\n'
-            + 'first text area, and push the button\n'
-            + 'to generate a submission.\n\n'     
-            
-            + 'You can copy the submission from the GUI\n'
-            + 'or save it as text to your computer.\n\n'        
-            
-            + 'An example is included in the form,\n'
+            + 'Push the "Example Sequence" button to see a small example of'
+            + ' a formatted sequence.\n'
             + 'Sequences should follow this pattern:\n'
             + '5\'utr EX1 int1 EX2 ... EX{X} 3\'utr\n\n'
             
-            + 'All spaces, tabs, and newlines are\n'
-            + 'removed and ignored.'
+            + 'To use this tool:\n'
+            + '1.) Fill in a Sample ID, Gene Name, and Allele.'
+            + ' This text will be included in the submission.\n'
+            + '2.) Paste your formatted sequence in the\n'
+            + 'Annotated Sequence text area.\n'
+            + '3.) Push \"Generate an EMBL submission\" button'
+            + ' to generate a submission.\n'
+            + '4.) Push the "Save the submission" button'
+            + ' to store the submission on your computer.\nYou can submit this file to EMBL.\n\n'
+            
+            + 'All spaces, tabs, and newlines are'
+            + ' removed before the nucleotide sequence is translated.'
             )
         
     def contactInformation(self):
@@ -236,12 +222,13 @@ def saveSubmissionFile(self):
         options['initialdir'] = self.idir
         options['parent'] = self
         options['title'] = 'Specify your output file.'
+        options['initialfile'] = 'NovelAlleleEMBLSubmission.txt'
         outputFileObject = tkFileDialog.asksaveasfile(**self.dir_opt)
         submissionText = self.submOutputGuiObject.get('1.0', 'end')
         outputFileObject.write(submissionText)
         
     # Gather sequence information from the input elements, and generate a text EMBL submission.
-    def updateGUI(self):
+    def constructSubmission(self):
 
         allGen = AlleleGenerator()
         roughFeatureSequence = self.featureInputGuiObject.get('1.0', 'end')
diff --git a/src/AlleleSubmissionEMBL.py b/src/AlleleSubmissionEMBL.py
index 19beb9b..7df558b 100755
--- a/src/AlleleSubmissionEMBL.py
+++ b/src/AlleleSubmissionEMBL.py
@@ -36,7 +36,7 @@
             AlleleGui(root).pack()
             root.mainloop()
 
-            print('Done.  Yay.')
+            print('Done.  Hooray.')
 
         # Print the Software Version
         elif (len(sys.argv) == 2 and (

From 6e23e0e330ad60fdaa86f71864c68227d232c57c Mon Sep 17 00:00:00 2001
From: Ben <ben.matern@gmail.com>
Date: Tue, 23 May 2017 17:39:19 +0200
Subject: [PATCH 3/3] Corrected software title

---
 README.md                   | 3 ++-
 src/AlleleGui.py            | 4 ++--
 src/AlleleSubmissionEMBL.py | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 70b453d..780d178 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
-#EMBL-HLA-SUBMISSION
+#Bhast
+Ben's HLA Allele Submission Tool
 A tool for generating an EMBL-formatted submission of a standard novel HLA allele. 
 
 ##Download the executable
diff --git a/src/AlleleGui.py b/src/AlleleGui.py
index ece5473..1a99985 100755
--- a/src/AlleleGui.py
+++ b/src/AlleleGui.py
@@ -14,7 +14,7 @@
 # along with EMBL-HLA-Submission. If not, see <http://www.gnu.org/licenses/>.
 
 # Version 1.0 
-SoftwareVersion = "EMBL-HLA-Submission Version 1.0"
+SoftwareVersion = "Bhast Version 1.0"
 
 import os
 
@@ -30,7 +30,7 @@ class AlleleGui(Tkinter.Frame):
     # Initialize the GUI
     def __init__(self, root):
         Tkinter.Frame.__init__(self, root)
-        root.title("AlleleGen - A Novel HLA Allele Submission Generator")
+        root.title("Bhast - A Novel HLA Allele Submission Generator")
         self.parent = root
         
         # Ctrl-A doesn't work by default in TK.  I guess I need to do it myself.
diff --git a/src/AlleleSubmissionEMBL.py b/src/AlleleSubmissionEMBL.py
index 7df558b..6070ddb 100755
--- a/src/AlleleSubmissionEMBL.py
+++ b/src/AlleleSubmissionEMBL.py
@@ -15,7 +15,7 @@
 
 # Version 1.0 
 
-SoftwareVersion = "EMBL-HLA-Submission Version 1.0"
+SoftwareVersion = "Bhast Version 1.0"
 
 import Tkinter
 import sys