diff --git a/data/sample.json b/data/sample.json new file mode 100644 index 00000000..f7423588 --- /dev/null +++ b/data/sample.json @@ -0,0 +1,193 @@ +{ + "Meta": { + "Name": "", + "GffVersion": "", + "RegionStart": 0, + "RegionEnd": 0, + "Size": 0, + "Type": "", + "GenbankDivision": "", + "Date": "", + "Definition": "Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p (AXL2) and Rev7p (REV7) genes, complete cds.", + "Accession": "U49845", + "Version": "U49845.1 GI:1293613", + "Keywords": ".", + "Organism": "Saccharomyces cerevisiae Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; Saccharomycetales; Saccharomycetaceae; Saccharomyces.", + "Source": "Saccharomyces cerevisiae (baker's yeast)", + "Origin": "", + "Locus": { + "Name": "SCU49845", + "SequenceLength": "5028", + "MoleculeType": "DNA", + "GenBankDivision": "PLN", + "ModDate": "21-JUN-1999", + "SequenceCoding": "bp", + "Circular": false + }, + "References": [ + { + "Index": "1", + "Authors": "Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.", + "Title": "Cloning and sequence of REV7, a gene whose function is required for DNA damage-induced mutagenesis in Saccharomyces cerevisiae", + "Journal": "Yeast 10 (11), 1503-1509 (1994)", + "PubMed": "7871890", + "Remark": "", + "Range": "(bases 1 to 5028)" + }, + { + "Index": "2", + "Authors": "Roemer,T., Madden,K., Chang,J. and Snyder,M.", + "Title": "Selection of axial growth sites in yeast requires Axl2p, a novel plasma membrane glycoprotein", + "Journal": "Genes Dev. 10 (7), 777-793 (1996)", + "PubMed": "8846915", + "Remark": "", + "Range": "(bases 1 to 5028)" + }, + { + "Index": "3", + "Authors": "Roemer,T.", + "Title": "Direct Submission", + "Journal": "Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New Haven, CT, USA", + "PubMed": "", + "Remark": "", + "Range": "(bases 1 to 5028)" + } + ], + "Primaries": null + }, + "Features": [ + { + "Name": "", + "Source": "", + "Type": "source", + "Start": 1, + "End": 5028, + "Complement": false, + "FivePrimePartial": false, + "ThreePrimePartial": false, + "Score": "", + "Strand": "", + "Phase": "", + "Attributes": { + "chromosome": "IX", + "db_xref": "taxon:4932", + "map": "9", + "organism": "Saccharomyces cerevisiae" + }, + "Location": "1..5028", + "Sequence": "" + }, + { + "Name": "", + "Source": "", + "Type": "CDS", + "Start": 1, + "End": 206, + "Complement": false, + "FivePrimePartial": true, + "ThreePrimePartial": false, + "Score": "", + "Strand": "", + "Phase": "", + "Attributes": { + "codon_start": "3", + "db_xref": "GI:1293614", + "product": "TCP1-beta", + "protein_id": "AAA98665.1", + "translation": "SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEAAEVLLRVDNIIRARPRTANRQHM" + }, + "Location": "\u003c1..206", + "Sequence": "" + }, + { + "Name": "", + "Source": "", + "Type": "gene", + "Start": 687, + "End": 3158, + "Complement": false, + "FivePrimePartial": false, + "ThreePrimePartial": false, + "Score": "", + "Strand": "", + "Phase": "", + "Attributes": { + "gene": "AXL2" + }, + "Location": "687..3158", + "Sequence": "" + }, + { + "Name": "", + "Source": "", + "Type": "CDS", + "Start": 687, + "End": 3158, + "Complement": false, + "FivePrimePartial": false, + "ThreePrimePartial": false, + "Score": "", + "Strand": "", + "Phase": "", + "Attributes": { + "codon_start": "1", + "db_xref": "GI:1293615", + "function": "required for axial budding pattern of S.cerevisiae", + "gene": "AXL2", + "note": "plasma membrane glycoprotein", + "product": "Axl2p", + "protein_id": "AAA98666.1", + "translation": "MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESFTFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFNVILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNEVFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPETSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYVYLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYGDVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQDHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSANATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIACGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLNNPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQSQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDSYGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTKHRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRLVDFSNKSNVNVGQVKDIHGRIPEML" + }, + "Location": "687..3158", + "Sequence": "" + }, + { + "Name": "", + "Source": "", + "Type": "gene", + "Start": 3300, + "End": 4037, + "Complement": true, + "FivePrimePartial": false, + "ThreePrimePartial": false, + "Score": "", + "Strand": "", + "Phase": "", + "Attributes": { + "gene": "REV7" + }, + "Location": "complement(3300..4037)", + "Sequence": "" + }, + { + "Name": "", + "Source": "", + "Type": "CDS", + "Start": 3300, + "End": 4037, + "Complement": true, + "FivePrimePartial": false, + "ThreePrimePartial": false, + "Score": "", + "Strand": "", + "Phase": "", + "Attributes": { + "codon_start": "1", + "db_xref": "GI:1293616", + "gene": "REV7", + "product": "Rev7p", + "protein_id": "AAA98667.1", + "translation": "MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQFVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVDKDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNRRVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEKLISGDDKILNGVYSQYEEGESIFGSLF" + }, + "Location": "complement(3300..4037)", + "Sequence": "" + } + ], + "Sequence": { + "Description": "", + "Hash": "", + "HashFunction": "", + "Sequence": "gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaagaaccgccaatagacaacatatgtaacatatttaggatatacctcgaaaataataaaccgccacactgtcattattataattagaaacagaacgcaaaaattatccactatataattcaaagacgcgaaaaaaaaagaacaacgcgtcatagaacttttggcaattcgcgtcacaaataaattttggcaacttatgtttcctcttcgagcagtactcgagccctgtctcaagaatgtaataatacccatcgtaggtatggttaaagatagcatctccacaacctcaaagctccttgccgagagtcgccctcctttgtcgagtaattttcacttttcatatgagaacttattttcttattctttactctcacatcctgtagtgattgacactgcaacagccaccatcactagaagaacagaacaattacttaatagaaaaattatatcttcctcgaaacgatttcctgcttccaacatctacgtatatcaagaagcattcacttaccatgacacagcttcagatttcattattgctgacagctactatatcactactccatctagtagtggccacgccctatgaggcatatcctatcggaaaacaataccccccagtggcaagagtcaatgaatcgtttacatttcaaatttccaatgatacctataaatcgtctgtagacaagacagctcaaataacatacaattgcttcgacttaccgagctggctttcgtttgactctagttctagaacgttctcaggtgaaccttcttctgacttactatctgatgcgaacaccacgttgtatttcaatgtaatactcgagggtacggactctgccgacagcacgtctttgaacaatacataccaatttgttgttacaaaccgtccatccatctcgctatcgtcagatttcaatctattggcgttgttaaaaaactatggttatactaacggcaaaaacgctctgaaactagatcctaatgaagtcttcaacgtgacttttgaccgttcaatgttcactaacgaagaatccattgtgtcgtattacggacgttctcagttgtataatgcgccgttacccaattggctgttcttcgattctggcgagttgaagtttactgggacggcaccggtgataaactcggcgattgctccagaaacaagctacagttttgtcatcatcgctacagacattgaaggattttctgccgttgaggtagaattcgaattagtcatcggggctcaccagttaactacctctattcaaaatagtttgataatcaacgttactgacacaggtaacgtttcatatgacttacctctaaactatgtttatctcgatgacgatcctatttcttctgataaattgggttctataaacttattggatgctccagactgggtggcattagataatgctaccatttccgggtctgtcccagatgaattactcggtaagaactccaatcctgccaatttttctgtgtccatttatgatacttatggtgatgtgatttatttcaacttcgaagttgtctccacaacggatttgtttgccattagttctcttcccaatattaacgctacaaggggtgaatggttctcctactattttttgccttctcagtttacagactacgtgaatacaaacgtttcattagagtttactaattcaagccaagaccatgactgggtgaaattccaatcatctaatttaacattagctggagaagtgcccaagaatttcgacaagctttcattaggtttgaaagcgaaccaaggttcacaatctcaagagctatattttaacatcattggcatggattcaaagataactcactcaaaccacagtgcgaatgcaacgtccacaagaagttctcaccactccacctcaacaagttcttacacatcttctacttacactgcaaaaatttcttctacctccgctgctgctacttcttctgctccagcagcgctgccagcagccaataaaacttcatctcacaataaaaaagcagtagcaattgcgtgcggtgttgctatcccattaggcgttatcctagtagctctcatttgcttcctaatattctggagacgcagaagggaaaatccagacgatgaaaacttaccgcatgctattagtggacctgatttgaataatcctgcaaataaaccaaatcaagaaaacgctacacctttgaacaacccctttgatgatgatgcttcctcgtacgatgatacttcaatagcaagaagattggctgctttgaacactttgaaattggataaccactctgccactgaatctgatatttccagcgtggatgaaaagagagattctctatcaggtatgaatacatacaatgatcagttccaatcccaaagtaaagaagaattattagcaaaacccccagtacagcctccagagagcccgttctttgacccacagaataggtcttcttctgtgtatatggatagtgaaccagcagtaaataaatcctggcgatatactggcaacctgtcaccagtctctgatattgtcagagacagttacggatcacaaaaaactgttgatacagaaaaacttttcgatttagaagcaccagagaaggaaaaacgtacgtcaagggatgtcactatgtcttcactggacccttggaacagcaatattagcccttctcccgtaagaaaatcagtaacaccatcaccatataacgtaacgaagcatcgtaaccgccacttacaaaatattcaagactctcaaagcggtaaaaacggaatcactcccacaacaatgtcaacttcatcttctgacgattttgttccggttaaagatggtgaaaatttttgctgggtccatagcatggaaccagacagaagaccaagtaagaaaaggttagtagatttttcaaataagagtaatgtcaatgttggtcaagttaaggacattcacggacgcatcccagaaatgctgtgattatacgcaacgatattttgcttaattttattttcctgttttattttttattagtggtttacagataccctatattttatttagtttttatacttagagacatttaattttaattccattcttcaaatttcatttttgcacttaaaacaaagatccaaaaatgctctcgccctcttcatattgagaatacactccattcaaaattttgtcgtcaccgctgattaatttttcactaaactgatgaataatcaaaggccccacgtcagaaccgactaaagaagtgagttttattttaggaggttgaaaaccattattgtctggtaaattttcatcttcttgacatttaacccagtttgaatccctttcaatttctgctttttcctccaaactatcgaccctcctgtttctgtccaacttatgtcctagttccaattcgatcgcattaataactgcttcaaatgttattgtgtcatcgttgactttaggtaatttctccaaatgcataatcaaactatttaaggaagatcggaattcgtcgaacacttcagtttccgtaatgatctgatcgtctttatccacatgttgtaattcactaaaatctaaaacgtatttttcaatgcataaatcgttctttttattaataatgcagatggaaaatctgtaaacgtgcgttaatttagaaagaacatccagtataagttcttctatatagtcaattaaagcaggatgcctattaatgggaacgaactgcggcaagttgaatgactggtaagtagtgtagtcgaatgactgaggtgggtatacatttctataaaataaaatcaaattaatgtagcattttaagtataccctcagccacttctctacccatctattcataaagctgacgcaacgattactattttttttttcttcttggatctcagtcgtcgcaaaaacgtataccttctttttccgaccttttttttagctttctggaaaagtttatattagttaaacagggtctagtcttagtgtgaaagctagtggtttcgattgactgatattaagaaagtggaaattaaattagtagtgtagacgtatatgcatatgtatttctcgcctgtttatgtttctacgtacttttgatttatagcaaggggaaaagaaatacatactattttttggtaaaggtgaaagcataatgtaaaagctagaataaaatggacgaaataaagagaggcttagttcatcttttttccaaaaagcacccaatgataataactaaaatgaaaaggatttgccatctgtcagcaacatcagttgtgtgagcaataataaaatcatcacctccgttgcctttagcgcgtttgtcgtttgtatcttccgtaattttagtcttatcaatgggaatcataaattttccaatgaattagcaatttcgtccaattctttttgagcttcttcatatttgctttggaattcttcgcacttcttttcccattcatctctttcttcttccaaagcaacgatccttctacccatttgctcagagttcaaatcggcctctttcagtttatccattgcttccttcagtttggcttcactgtcttctagctgttgttctagatcctggtttttcttggtgtagttctcattattagatctcaagttattggagtcttcagccaattgctttgtatcagacaattgactctctaacttctccacttcactgtcgagttgctcgtttttagcggacaaagatttaatctcgttttctttttcagtgttagattgctctaattctttgagctgttctctcagctcctcatatttttcttgccatgactcagattctaattttaagctattcaatttctctttgatc" + } +} \ No newline at end of file diff --git a/hash.go b/hash.go index 475f9187..f2a55818 100644 --- a/hash.go +++ b/hash.go @@ -36,10 +36,11 @@ import ( // BLAKE2b_384 // import golang.org/x/crypto/blake2b // BLAKE2b_512 // import golang.org/x/crypto/blake2b -// BoothLeastRotation gets the least rotation of a circular string. -// https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation -// this is generally over commented but I'm keeping it this way for now. - Tim -func BoothLeastRotation(sequence string) int { +// boothLeastRotation gets the least rotation of a circular string. +func boothLeastRotation(sequence string) int { + + // https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation + // this is generally over commented but I'm keeping it this way for now. - Tim // first concatenate the sequence to itself to avoid modular arithmateic sequence += sequence // maybe do this as a buffer just for speed? May get annoying with larger sequences. @@ -89,8 +90,14 @@ func BoothLeastRotation(sequence string) int { // RotateSequence rotates circular sequences to deterministic point. func RotateSequence(sequence string) string { - rotationIndex := BoothLeastRotation(sequence) - concatenatedSequence := sequence + sequence + rotationIndex := boothLeastRotation(sequence) + var sequenceBuilder strings.Builder + + // writing the same sequence twice. using build incase of very long circular genome. + sequenceBuilder.WriteString(sequence) + sequenceBuilder.WriteString(sequence) + + concatenatedSequence := sequenceBuilder.String() sequence = concatenatedSequence[rotationIndex : rotationIndex+len(sequence)] return sequence } diff --git a/hash_test.go b/hash_test.go index 44dff797..9a8a4069 100644 --- a/hash_test.go +++ b/hash_test.go @@ -9,6 +9,13 @@ import ( "lukechampine.com/blake3" ) +func ExampleHash() { + puc19 := ReadGbk("data/puc19.gbk") + fmt.Println(puc19.Hash(blake3.New(32, nil))) // passing new hash.Hash struct to Hasher + + // output: 4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9 +} + func TestHashRegression(t *testing.T) { puc19GbkBlake3Hash := "4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9" puc19 := ReadGbk("data/puc19.gbk") @@ -28,6 +35,15 @@ func TestHashRegression(t *testing.T) { } } +func ExampleRotateSequence() { + sequence := ReadGbk("data/puc19.gbk") + sequenceLength := len(sequence.Sequence) + testSequence := sequence.Sequence[sequenceLength/2:] + sequence.Sequence[0:sequenceLength/2] + + fmt.Println(RotateSequence(sequence.Sequence) == RotateSequence(testSequence)) + // output: true +} + func TestLeastRotation(t *testing.T) { sequence := ReadGbk("data/puc19.gbk") var sequenceBuffer bytes.Buffer diff --git a/io.go b/io.go index 39a9c8ff..d7aad795 100644 --- a/io.go +++ b/io.go @@ -139,7 +139,9 @@ GFF specific IO related things begin here. ******************************************************************************/ // ParseGff Takes in a string representing a gffv3 file and parses it into an Sequence object. -func ParseGff(gff string) Sequence { +func ParseGff(gffBytes []byte) Sequence { + + gff := string(gffBytes) sequence := Sequence{} lines := strings.Split(gff, "\n") @@ -327,7 +329,7 @@ func ReadGff(path string) Sequence { if err != nil { // return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err) } else { - sequence = ParseGff(string(file)) + sequence = ParseGff(file) } return sequence } @@ -392,8 +394,8 @@ FASTA specific IO related things begin here. ******************************************************************************/ // ParseFASTA parses a Sequence struct from a FASTA file and adds appropriate pointers to the structs. -func ParseFASTA(fasta string) Sequence { - +func ParseFASTA(fastaBytes []byte) Sequence { + fasta := string(fastaBytes) var sequence Sequence var feature Feature var features []Feature @@ -500,8 +502,8 @@ func ReadFASTA(path string) Sequence { if err != nil { // return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err) } - annotatedSequenceArray := ParseFASTA(string(file)) - return annotatedSequenceArray + sequence := ParseFASTA(file) + return sequence } // WriteFASTA writes a Sequence struct out to FASTA. @@ -522,8 +524,9 @@ GBK specific IO related things begin here. ******************************************************************************/ // ParseGbk takes in a string representing a gbk/gb/genbank file and parses it into an Sequence object. -func ParseGbk(gbk string) Sequence { +func ParseGbk(gbkBytes []byte) Sequence { + gbk := string(gbkBytes) lines := strings.Split(gbk, "\n") // Create meta struct @@ -716,8 +719,7 @@ func ReadGbk(path string) Sequence { if err != nil { // return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err) } else { - gbkString := string(file) - sequence = ParseGbk(gbkString) + sequence = ParseGbk(file) } return sequence diff --git a/io_test.go b/io_test.go index 167f93fd..a29de128 100644 --- a/io_test.go +++ b/io_test.go @@ -29,6 +29,45 @@ Gff related tests and benchmarks begin here. ******************************************************************************/ +func ExampleReadGff() { + + sequence := ReadGff("data/ecoli-mg1655.gff") + fmt.Println(sequence.Meta.Name) + // Output: U00096.3 +} + +func ExampleParseGff() { + file, _ := ioutil.ReadFile("data/ecoli-mg1655.gff") + sequence := ParseGff(file) + + fmt.Println(sequence.Meta.Name) + // Output: U00096.3 +} + +func ExampleBuildGff() { + + sequence := ReadGff("data/ecoli-mg1655.gff") + gffBytes := BuildGff(sequence) + reparsedSequence := ParseGff(gffBytes) + + fmt.Println(reparsedSequence.Meta.Name) + // Output: U00096.3 + +} + +func ExampleWriteGff() { + sequence := ReadGff("data/ecoli-mg1655.gff") + WriteGff(sequence, "data/test.gff") + testSequence := ReadGff("data/test.gff") + + os.Remove("data/test.gff") + + fmt.Println(testSequence.Meta.Name) + // Output: U00096.3 +} + +// TODO should delete output files. + func TestGffIO(t *testing.T) { testInputPath := "data/ecoli-mg1655.gff" testOutputPath := "data/test.gff" @@ -65,7 +104,7 @@ func TestGffIO(t *testing.T) { func BenchmarkReadGff(b *testing.B) { for i := 0; i < b.N; i++ { - ParseGff("data/ecoli-mg1655.gff") + ReadGff("data/ecoli-mg1655.gff") } } @@ -87,6 +126,40 @@ Gbk/gb/genbank related benchmarks begin here. ******************************************************************************/ +func ExampleReadGbk() { + sequence := ReadGbk("data/puc19.gbk") + fmt.Println(sequence.Meta.Locus.ModificationDate) + // Output: 22-OCT-2019 +} + +func ExampleParseGbk() { + file, _ := ioutil.ReadFile("data/puc19.gbk") + sequence := ParseGbk(file) + + fmt.Println(sequence.Meta.Locus.ModificationDate) + // Output: 22-OCT-2019 +} + +func ExampleBuildGbk() { + sequence := ReadGbk("data/puc19.gbk") + gbkBytes := BuildGbk(sequence) + testSequence := ParseGbk(gbkBytes) + + fmt.Println(testSequence.Meta.Locus.ModificationDate) + // Output: 22-OCT-2019 +} + +func ExampleWriteGbk() { + sequence := ReadGbk("data/puc19.gbk") + WriteGbk(sequence, "data/test.gbk") + testSequence := ReadGbk("data/test.gbk") + + os.Remove("data/test.gbk") + + fmt.Println(testSequence.Meta.Locus.ModificationDate) + // Output: 22-OCT-2019 +} + func TestGbkIO(t *testing.T) { gbk := ReadGbk("data/puc19.gbk") WriteGbk(gbk, "data/puc19gbktest.gbk") @@ -202,6 +275,32 @@ JSON related tests begin here. ******************************************************************************/ +func ExampleReadJSON() { + sequence := ReadJSON("data/sample.json") + + fmt.Println(sequence.Meta.Source) + //output: Saccharomyces cerevisiae (baker's yeast) +} + +func ExampleParseJSON() { + file, _ := ioutil.ReadFile("data/sample.json") + sequence := ParseJSON(file) + + fmt.Println(sequence.Meta.Source) + //output: Saccharomyces cerevisiae (baker's yeast) +} + +func ExampleWriteJSON() { + sequence := ReadJSON("data/sample.json") + WriteJSON(sequence, "data/test.json") + testSequence := ReadJSON("data/test.json") + + os.Remove("data/test.json") + + fmt.Println(testSequence.Meta.Source) + //output: Saccharomyces cerevisiae (baker's yeast) +} + func TestJSONIO(t *testing.T) { testSequence := ReadGbk("data/bsub.gbk") WriteJSON(testSequence, "data/test.json") @@ -248,7 +347,7 @@ func ExampleReadFASTA() { func ExampleParseFASTA() { file, _ := ioutil.ReadFile("data/base.fasta") - sequence := ParseFASTA(string(file)) + sequence := ParseFASTA(file) fmt.Println(sequence.Features[0].Description) // Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] diff --git a/poly/commands.go b/poly/commands.go index 0e6d18b8..0abd7227 100644 --- a/poly/commands.go +++ b/poly/commands.go @@ -292,9 +292,7 @@ func optimizeCommand(c *cli.Context) error { if isNumeric(c.String("ct")) { codonTableNumber, _ := strconv.Atoi(c.String("ct")) - codonTable = poly.DefaultCodonTablesByNumber[codonTableNumber] - } else { - codonTable = poly.DefaultCodonTablesByName[c.String("ct")] + codonTable = poly.GetCodonTable(codonTableNumber) } // if a file exists to weigh the table. Weigh it. @@ -347,9 +345,7 @@ func translateCommand(c *cli.Context) error { if isNumeric(c.String("ct")) { codonTableNumber, _ := strconv.Atoi(c.String("ct")) - codonTable = poly.DefaultCodonTablesByNumber[codonTableNumber] - } else { - codonTable = poly.DefaultCodonTablesByName[c.String("ct")] + codonTable = poly.GetCodonTable(codonTableNumber) } // uncomment below to parse sequence from pipe @@ -364,7 +360,7 @@ func translateCommand(c *cli.Context) error { } // a simple helper function to convert an *os.File type into a string. -func stdinToString(file io.Reader) string { +func stdinToBytes(file io.Reader) []byte { var stringBuffer bytes.Buffer reader := bufio.NewReader(file) for { @@ -374,7 +370,7 @@ func stdinToString(file io.Reader) string { } stringBuffer.WriteRune(input) } - return stringBuffer.String() + return stringBuffer.Bytes() } // a simple helper function to remove duplicates from a list of strings. @@ -420,13 +416,13 @@ func parseStdin(c *cli.Context) poly.Sequence { var sequence poly.Sequence // logic for determining input format, then parses accordingly. if c.String("i") == "json" { - json.Unmarshal([]byte(stdinToString(c.App.Reader)), &sequence) + json.Unmarshal([]byte(stdinToBytes(c.App.Reader)), &sequence) } else if c.String("i") == "gbk" || c.String("i") == "gb" { - sequence = poly.ParseGbk(stdinToString(c.App.Reader)) + sequence = poly.ParseGbk(stdinToBytes(c.App.Reader)) } else if c.String("i") == "gff" { - sequence = poly.ParseGff(stdinToString(c.App.Reader)) + sequence = poly.ParseGff(stdinToBytes(c.App.Reader)) } else if c.String("i") == "string" { - sequence.Sequence = stdinToString(c.App.Reader) + sequence.Sequence = string(stdinToBytes(c.App.Reader)) } return sequence } diff --git a/poly/commands_test.go b/poly/commands_test.go index c501ea3e..708e2752 100644 --- a/poly/commands_test.go +++ b/poly/commands_test.go @@ -200,7 +200,7 @@ func TestOptimizeString(t *testing.T) { // should return codon optimized sequence optimizeOutputString := strings.TrimSpace(writeBuffer.String()) - translation := poly.Translate(optimizeOutputString, poly.DefaultCodonTablesByNumber[1]) + translation := poly.Translate(optimizeOutputString, poly.GetCodonTable(1)) if translation != gfpTranslation { t.Errorf("TestOptimizeCommand for string output has failed. Returned %q, want %q", translation, gfpTranslation) diff --git a/poly/main.go b/poly/main.go index 308489e1..be238a50 100644 --- a/poly/main.go +++ b/poly/main.go @@ -178,7 +178,7 @@ func application() *cli.App { &cli.StringFlag{ Name: "ct", Aliases: []string{"--codon-table"}, - Value: "Standard", + Value: "1", Usage: "Specify codon table used for organism. Defaults to Standard NCBI table.", }, &cli.BoolFlag{ diff --git a/primers_test.go b/primers_test.go index 5c560c53..c1f52927 100644 --- a/primers_test.go +++ b/primers_test.go @@ -1,10 +1,19 @@ package poly import ( + "fmt" "math" "testing" ) +func ExampleMarmurDoty() { + sequenceString := "ACGTCCGGACTT" + meltingTemp := MarmurDoty(sequenceString) + + fmt.Println(meltingTemp) + // output: 31 +} + func TestMarmurDoty(t *testing.T) { testSeq := "ACGTCCGGACTT" expectedTM := 31.0 @@ -13,6 +22,19 @@ func TestMarmurDoty(t *testing.T) { } } +func ExampleSantaLucia() { + + sequenceString := "ACGATGGCAGTAGCATGC" //"GTAAAACGACGGCCAGT" // M13 fwd + testCPrimer := 0.1e-6 // primer concentration + testCNa := 350e-3 // salt concentration + testCMg := 0.0 // magnesium concentration + expectedTM := 62.7 // roughly what we're expecting with a margin of error + meltingTemp, _, _ := SantaLucia(sequenceString, testCPrimer, testCNa, testCMg) + withinMargin := math.Abs(expectedTM-meltingTemp)/expectedTM >= 0.02 // checking margin of error + + fmt.Println(withinMargin) + // output: false +} func TestSantaLucia(t *testing.T) { testSeq := "ACGATGGCAGTAGCATGC" //"GTAAAACGACGGCCAGT" // M13 fwd testCPrimer := 0.1e-6 @@ -24,7 +46,17 @@ func TestSantaLucia(t *testing.T) { } } -func TestCalcTM(t *testing.T) { +func ExampleMeltingTemp() { + sequenceString := "GTAAAACGACGGCCAGT" // M13 fwd + expectedTM := 52.8 + meltingTemp := MeltingTemp(sequenceString) + withinMargin := math.Abs(expectedTM-meltingTemp)/expectedTM >= 0.02 + + fmt.Println(withinMargin) + // output: false +} + +func TestMeltingTemp(t *testing.T) { testSeq := "GTAAAACGACGGCCAGT" // M13 fwd expectedTM := 52.8 if calcTM := MeltingTemp(testSeq); math.Abs(expectedTM-calcTM)/expectedTM >= 0.02 { diff --git a/sequence.go b/sequence.go index efe1cdfc..33297625 100644 --- a/sequence.go +++ b/sequence.go @@ -5,8 +5,8 @@ import ( "strings" ) -// ComplementBaseRuneMap provides 1:1 mapping between bases and their complements -var ComplementBaseRuneMap = map[rune]rune{ +// complementBaseRuneMap provides 1:1 mapping between bases and their complements +var complementBaseRuneMap = map[rune]rune{ 65: 84, // A -> T 66: 86, // B -> V 67: 71, // C -> G @@ -65,7 +65,7 @@ func ReverseComplement(sequence string) string { // ComplementBase accepts a base pair and returns its complement base pair func ComplementBase(basePair rune) rune { - return ComplementBaseRuneMap[basePair] + return complementBaseRuneMap[basePair] } // getFeatureSequence takes a feature and location object and returns a sequence string. diff --git a/transformations.go b/transformations.go index 5160e8a7..dc232a91 100644 --- a/transformations.go +++ b/transformations.go @@ -116,7 +116,7 @@ func Optimize(aminoAcids string, codonTable CodonTable) string { func (codonTable CodonTable) CreateWeights(sequence string) CodonTable { sequence = strings.ToUpper(sequence) - codonFrequencyMap := GetCodonFrequency(sequence) + codonFrequencyMap := getCodonFrequency(sequence) for aminoAcidIndex, aminoAcid := range codonTable.AminoAcids { @@ -146,8 +146,8 @@ func (codonTable CodonTable) CreateWeights(sequence string) CodonTable { return codonTable } -// GetCodonFrequency takes a DNA sequence and returns a hashmap of its codons and their frequencies. -func GetCodonFrequency(sequence string) map[string]int { +// getCodonFrequency takes a DNA sequence and returns a hashmap of its codons and their frequencies. +func getCodonFrequency(sequence string) map[string]int { codonFrequencyHashMap := map[string]int{} var currentCodon strings.Builder @@ -285,8 +285,13 @@ func generateCodonTable(aminoAcids, starts string) CodonTable { return CodonTable{startCodons, stopCodons, aminoAcidSlice} } -// DefaultCodonTablesByNumber stores all codon tables published by NCBI https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi using numbered indeces. -var DefaultCodonTablesByNumber = map[int]CodonTable{ +// GetCodonTable takes the index of desired NCBI codon table and returns it. +func GetCodonTable(index int) CodonTable { + return defaultCodonTablesByNumber[index] +} + +// defaultCodonTablesByNumber stores all codon tables published by NCBI https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi using numbered indeces. +var defaultCodonTablesByNumber = map[int]CodonTable{ 1: generateCodonTable("FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "---M------**--*----M---------------M----------------------------"), 2: generateCodonTable("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", "----------**--------------------MMMM----------**---M------------"), 3: generateCodonTable("FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------**----------------------MM---------------M------------"), @@ -314,7 +319,7 @@ var DefaultCodonTablesByNumber = map[int]CodonTable{ 33: generateCodonTable("FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG", "---M-------*-------M---------------M---------------M------------")} // DefaultCodonTablesByName stores all codon tables published by NCBI https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi using named indeces. -var DefaultCodonTablesByName = map[string]CodonTable{ +var defaultCodonTablesByName = map[string]CodonTable{ "Standard": generateCodonTable("FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "---M------**--*----M---------------M----------------------------"), // 1 "VertebrateMitochondrial": generateCodonTable("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", "----------**--------------------MMMM----------**---M------------"), // 2 "YeastMitochondrial": generateCodonTable("FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------**----------------------MM---------------M------------"), // 3 diff --git a/transformations_test.go b/transformations_test.go index e0362f4d..ab43047a 100644 --- a/transformations_test.go +++ b/transformations_test.go @@ -1,22 +1,49 @@ package poly import ( + "fmt" "strings" "testing" ) +func ExampleTranslate() { + + gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" + gfpDnaSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" + testTranslation := Translate(gfpDnaSequence, GetCodonTable(11)) // need to specify which codons map to which amino acids per NCBI table + + fmt.Println(gfpTranslation == testTranslation) + // output: true +} + func TestTranslation(t *testing.T) { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" gfpDnaSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" - if got := Translate(gfpDnaSequence, DefaultCodonTablesByNumber[11]); got != gfpTranslation { + if got := Translate(gfpDnaSequence, GetCodonTable(11)); got != gfpTranslation { t.Errorf("TestTranslation has failed. Translate has returned %q, want %q", got, gfpTranslation) } } +func ExampleOptimize() { + + gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" + codonTable := GetCodonTable(11) + + sequence := ReadGbk("data/bsub.gbk") + rawSequence := sequence.Sequence + codonTable.CreateWeights(rawSequence) + + optimizedSequence := Optimize(gfpTranslation, codonTable) + optimizedSequenceTranslation := Translate(optimizedSequence, codonTable) + + fmt.Println(optimizedSequenceTranslation == gfpTranslation) + // output: true +} + func TestOptimize(t *testing.T) { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - codonTable := DefaultCodonTablesByNumber[11] + codonTable := GetCodonTable(11) sequence := ReadGbk("data/bsub.gbk") rawSequence := sequence.Sequence @@ -33,7 +60,7 @@ func TestOptimize(t *testing.T) { func TestGetCodonFrequency(t *testing.T) { - translationTable := DefaultCodonTablesByNumber[11].generateTranslationTable() + translationTable := GetCodonTable(11).generateTranslationTable() var codons strings.Builder @@ -51,7 +78,7 @@ func TestGetCodonFrequency(t *testing.T) { t.Errorf("TestGetCodonFrequency has failed. aggregrated codon string is not the correct length.") } - codonFrequencyHashMap := GetCodonFrequency(codonString) + codonFrequencyHashMap := getCodonFrequency(codonString) if len(codonFrequencyHashMap) != 64 { t.Errorf("TestGetCodonFrequency has failed. codonFrequencyHashMap does not contain every codon.") @@ -63,7 +90,7 @@ func TestGetCodonFrequency(t *testing.T) { } } - doubleCodonFrequencyHashMap := GetCodonFrequency(codonString + codonString) + doubleCodonFrequencyHashMap := getCodonFrequency(codonString + codonString) if len(doubleCodonFrequencyHashMap) != 64 { t.Errorf("TestGetCodonFrequency has failed. doubleCodonFrequencyHashMap does not contain every codon.")