Skip to content

Commit

Permalink
Refactor docs (#60)
Browse files Browse the repository at this point in the history
* added a couple of example functions to io_test.go.

* added Gff IO example tests.

* added gbk IO example tests.

* added JSON IO example tests.

* refactored parse functions to accept bytes instead of strings.

* adding sample json file for testing.

* added example tests for primer functions.

* make complementBaseRuneMap private.

* made defaultCodonTable maps private.

* made getCodonFrequency private.

* recommented BoothLeastRotation.

* added example test to hash_test.go.

* modified RotateSequence and made boothLeastRotation private.

* added example tests to translation_test.go.
  • Loading branch information
TimothyStiles authored Oct 31, 2020
1 parent 67bc453 commit 9cd47eb
Show file tree
Hide file tree
Showing 12 changed files with 423 additions and 46 deletions.
193 changes: 193 additions & 0 deletions data/sample.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
{
"Meta": {
"Name": "",
"GffVersion": "",
"RegionStart": 0,
"RegionEnd": 0,
"Size": 0,
"Type": "",
"GenbankDivision": "",
"Date": "",
"Definition": "Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p (AXL2) and Rev7p (REV7) genes, complete cds.",
"Accession": "U49845",
"Version": "U49845.1 GI:1293613",
"Keywords": ".",
"Organism": "Saccharomyces cerevisiae Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; Saccharomycetales; Saccharomycetaceae; Saccharomyces.",
"Source": "Saccharomyces cerevisiae (baker's yeast)",
"Origin": "",
"Locus": {
"Name": "SCU49845",
"SequenceLength": "5028",
"MoleculeType": "DNA",
"GenBankDivision": "PLN",
"ModDate": "21-JUN-1999",
"SequenceCoding": "bp",
"Circular": false
},
"References": [
{
"Index": "1",
"Authors": "Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.",
"Title": "Cloning and sequence of REV7, a gene whose function is required for DNA damage-induced mutagenesis in Saccharomyces cerevisiae",
"Journal": "Yeast 10 (11), 1503-1509 (1994)",
"PubMed": "7871890",
"Remark": "",
"Range": "(bases 1 to 5028)"
},
{
"Index": "2",
"Authors": "Roemer,T., Madden,K., Chang,J. and Snyder,M.",
"Title": "Selection of axial growth sites in yeast requires Axl2p, a novel plasma membrane glycoprotein",
"Journal": "Genes Dev. 10 (7), 777-793 (1996)",
"PubMed": "8846915",
"Remark": "",
"Range": "(bases 1 to 5028)"
},
{
"Index": "3",
"Authors": "Roemer,T.",
"Title": "Direct Submission",
"Journal": "Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New Haven, CT, USA",
"PubMed": "",
"Remark": "",
"Range": "(bases 1 to 5028)"
}
],
"Primaries": null
},
"Features": [
{
"Name": "",
"Source": "",
"Type": "source",
"Start": 1,
"End": 5028,
"Complement": false,
"FivePrimePartial": false,
"ThreePrimePartial": false,
"Score": "",
"Strand": "",
"Phase": "",
"Attributes": {
"chromosome": "IX",
"db_xref": "taxon:4932",
"map": "9",
"organism": "Saccharomyces cerevisiae"
},
"Location": "1..5028",
"Sequence": ""
},
{
"Name": "",
"Source": "",
"Type": "CDS",
"Start": 1,
"End": 206,
"Complement": false,
"FivePrimePartial": true,
"ThreePrimePartial": false,
"Score": "",
"Strand": "",
"Phase": "",
"Attributes": {
"codon_start": "3",
"db_xref": "GI:1293614",
"product": "TCP1-beta",
"protein_id": "AAA98665.1",
"translation": "SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEAAEVLLRVDNIIRARPRTANRQHM"
},
"Location": "\u003c1..206",
"Sequence": ""
},
{
"Name": "",
"Source": "",
"Type": "gene",
"Start": 687,
"End": 3158,
"Complement": false,
"FivePrimePartial": false,
"ThreePrimePartial": false,
"Score": "",
"Strand": "",
"Phase": "",
"Attributes": {
"gene": "AXL2"
},
"Location": "687..3158",
"Sequence": ""
},
{
"Name": "",
"Source": "",
"Type": "CDS",
"Start": 687,
"End": 3158,
"Complement": false,
"FivePrimePartial": false,
"ThreePrimePartial": false,
"Score": "",
"Strand": "",
"Phase": "",
"Attributes": {
"codon_start": "1",
"db_xref": "GI:1293615",
"function": "required for axial budding pattern of S.cerevisiae",
"gene": "AXL2",
"note": "plasma membrane glycoprotein",
"product": "Axl2p",
"protein_id": "AAA98666.1",
"translation": "MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESFTFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFNVILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNEVFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPETSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYVYLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYGDVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQDHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSANATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIACGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLNNPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQSQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDSYGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTKHRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRLVDFSNKSNVNVGQVKDIHGRIPEML"
},
"Location": "687..3158",
"Sequence": ""
},
{
"Name": "",
"Source": "",
"Type": "gene",
"Start": 3300,
"End": 4037,
"Complement": true,
"FivePrimePartial": false,
"ThreePrimePartial": false,
"Score": "",
"Strand": "",
"Phase": "",
"Attributes": {
"gene": "REV7"
},
"Location": "complement(3300..4037)",
"Sequence": ""
},
{
"Name": "",
"Source": "",
"Type": "CDS",
"Start": 3300,
"End": 4037,
"Complement": true,
"FivePrimePartial": false,
"ThreePrimePartial": false,
"Score": "",
"Strand": "",
"Phase": "",
"Attributes": {
"codon_start": "1",
"db_xref": "GI:1293616",
"gene": "REV7",
"product": "Rev7p",
"protein_id": "AAA98667.1",
"translation": "MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQFVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVDKDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNRRVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEKLISGDDKILNGVYSQYEEGESIFGSLF"
},
"Location": "complement(3300..4037)",
"Sequence": ""
}
],
"Sequence": {
"Description": "",
"Hash": "",
"HashFunction": "",
"Sequence": "gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaagaaccgccaatagacaacatatgtaacatatttaggatatacctcgaaaataataaaccgccacactgtcattattataattagaaacagaacgcaaaaattatccactatataattcaaagacgcgaaaaaaaaagaacaacgcgtcatagaacttttggcaattcgcgtcacaaataaattttggcaacttatgtttcctcttcgagcagtactcgagccctgtctcaagaatgtaataatacccatcgtaggtatggttaaagatagcatctccacaacctcaaagctccttgccgagagtcgccctcctttgtcgagtaattttcacttttcatatgagaacttattttcttattctttactctcacatcctgtagtgattgacactgcaacagccaccatcactagaagaacagaacaattacttaatagaaaaattatatcttcctcgaaacgatttcctgcttccaacatctacgtatatcaagaagcattcacttaccatgacacagcttcagatttcattattgctgacagctactatatcactactccatctagtagtggccacgccctatgaggcatatcctatcggaaaacaataccccccagtggcaagagtcaatgaatcgtttacatttcaaatttccaatgatacctataaatcgtctgtagacaagacagctcaaataacatacaattgcttcgacttaccgagctggctttcgtttgactctagttctagaacgttctcaggtgaaccttcttctgacttactatctgatgcgaacaccacgttgtatttcaatgtaatactcgagggtacggactctgccgacagcacgtctttgaacaatacataccaatttgttgttacaaaccgtccatccatctcgctatcgtcagatttcaatctattggcgttgttaaaaaactatggttatactaacggcaaaaacgctctgaaactagatcctaatgaagtcttcaacgtgacttttgaccgttcaatgttcactaacgaagaatccattgtgtcgtattacggacgttctcagttgtataatgcgccgttacccaattggctgttcttcgattctggcgagttgaagtttactgggacggcaccggtgataaactcggcgattgctccagaaacaagctacagttttgtcatcatcgctacagacattgaaggattttctgccgttgaggtagaattcgaattagtcatcggggctcaccagttaactacctctattcaaaatagtttgataatcaacgttactgacacaggtaacgtttcatatgacttacctctaaactatgtttatctcgatgacgatcctatttcttctgataaattgggttctataaacttattggatgctccagactgggtggcattagataatgctaccatttccgggtctgtcccagatgaattactcggtaagaactccaatcctgccaatttttctgtgtccatttatgatacttatggtgatgtgatttatttcaacttcgaagttgtctccacaacggatttgtttgccattagttctcttcccaatattaacgctacaaggggtgaatggttctcctactattttttgccttctcagtttacagactacgtgaatacaaacgtttcattagagtttactaattcaagccaagaccatgactgggtgaaattccaatcatctaatttaacattagctggagaagtgcccaagaatttcgacaagctttcattaggtttgaaagcgaaccaaggttcacaatctcaagagctatattt
taacatcattggcatggattcaaagataactcactcaaaccacagtgcgaatgcaacgtccacaagaagttctcaccactccacctcaacaagttcttacacatcttctacttacactgcaaaaatttcttctacctccgctgctgctacttcttctgctccagcagcgctgccagcagccaataaaacttcatctcacaataaaaaagcagtagcaattgcgtgcggtgttgctatcccattaggcgttatcctagtagctctcatttgcttcctaatattctggagacgcagaagggaaaatccagacgatgaaaacttaccgcatgctattagtggacctgatttgaataatcctgcaaataaaccaaatcaagaaaacgctacacctttgaacaacccctttgatgatgatgcttcctcgtacgatgatacttcaatagcaagaagattggctgctttgaacactttgaaattggataaccactctgccactgaatctgatatttccagcgtggatgaaaagagagattctctatcaggtatgaatacatacaatgatcagttccaatcccaaagtaaagaagaattattagcaaaacccccagtacagcctccagagagcccgttctttgacccacagaataggtcttcttctgtgtatatggatagtgaaccagcagtaaataaatcctggcgatatactggcaacctgtcaccagtctctgatattgtcagagacagttacggatcacaaaaaactgttgatacagaaaaacttttcgatttagaagcaccagagaaggaaaaacgtacgtcaagggatgtcactatgtcttcactggacccttggaacagcaatattagcccttctcccgtaagaaaatcagtaacaccatcaccatataacgtaacgaagcatcgtaaccgccacttacaaaatattcaagactctcaaagcggtaaaaacggaatcactcccacaacaatgtcaacttcatcttctgacgattttgttccggttaaagatggtgaaaatttttgctgggtccatagcatggaaccagacagaagaccaagtaagaaaaggttagtagatttttcaaataagagtaatgtcaatgttggtcaagttaaggacattcacggacgcatcccagaaatgctgtgattatacgcaacgatattttgcttaattttattttcctgttttattttttattagtggtttacagataccctatattttatttagtttttatacttagagacatttaattttaattccattcttcaaatttcatttttgcacttaaaacaaagatccaaaaatgctctcgccctcttcatattgagaatacactccattcaaaattttgtcgtcaccgctgattaatttttcactaaactgatgaataatcaaaggccccacgtcagaaccgactaaagaagtgagttttattttaggaggttgaaaaccattattgtctggtaaattttcatcttcttgacatttaacccagtttgaatccctttcaatttctgctttttcctccaaactatcgaccctcctgtttctgtccaacttatgtcctagttccaattcgatcgcattaataactgcttcaaatgttattgtgtcatcgttgactttaggtaatttctccaaatgcataatcaaactatttaaggaagatcggaattcgtcgaacacttcagtttccgtaatgatctgatcgtctttatccacatgttgtaattcactaaaatctaaaacgtatttttcaatgcataaatcgttctttttattaataatgcagatggaaaatctgtaaacgtgcgttaatttagaaagaacatccagtataagttcttctatatagtcaattaaagcaggatgcctattaatgggaacgaactgcggcaagttgaatgactggtaagtagtgtagtcgaatgactgaggtgggtatacatttctataaaataaaatcaaatta
atgtagcattttaagtataccctcagccacttctctacccatctattcataaagctgacgcaacgattactattttttttttcttcttggatctcagtcgtcgcaaaaacgtataccttctttttccgaccttttttttagctttctggaaaagtttatattagttaaacagggtctagtcttagtgtgaaagctagtggtttcgattgactgatattaagaaagtggaaattaaattagtagtgtagacgtatatgcatatgtatttctcgcctgtttatgtttctacgtacttttgatttatagcaaggggaaaagaaatacatactattttttggtaaaggtgaaagcataatgtaaaagctagaataaaatggacgaaataaagagaggcttagttcatcttttttccaaaaagcacccaatgataataactaaaatgaaaaggatttgccatctgtcagcaacatcagttgtgtgagcaataataaaatcatcacctccgttgcctttagcgcgtttgtcgtttgtatcttccgtaattttagtcttatcaatgggaatcataaattttccaatgaattagcaatttcgtccaattctttttgagcttcttcatatttgctttggaattcttcgcacttcttttcccattcatctctttcttcttccaaagcaacgatccttctacccatttgctcagagttcaaatcggcctctttcagtttatccattgcttccttcagtttggcttcactgtcttctagctgttgttctagatcctggtttttcttggtgtagttctcattattagatctcaagttattggagtcttcagccaattgctttgtatcagacaattgactctctaacttctccacttcactgtcgagttgctcgtttttagcggacaaagatttaatctcgttttctttttcagtgttagattgctctaattctttgagctgttctctcagctcctcatatttttcttgccatgactcagattctaattttaagctattcaatttctctttgatc"
}
}
19 changes: 13 additions & 6 deletions hash.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,11 @@ import (
// BLAKE2b_384 // import golang.org/x/crypto/blake2b
// BLAKE2b_512 // import golang.org/x/crypto/blake2b

// BoothLeastRotation gets the least rotation of a circular string.
// https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation
// this is generally over commented but I'm keeping it this way for now. - Tim
func BoothLeastRotation(sequence string) int {
// boothLeastRotation gets the least rotation of a circular string.
func boothLeastRotation(sequence string) int {

// https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation
// this is generally over commented but I'm keeping it this way for now. - Tim

// first concatenate the sequence to itself to avoid modular arithmetic
sequence += sequence // maybe do this as a buffer just for speed? May get annoying with larger sequences.
Expand Down Expand Up @@ -89,8 +90,14 @@ func BoothLeastRotation(sequence string) int {

// RotateSequence rotates circular sequences to a deterministic point.
func RotateSequence(sequence string) string {
rotationIndex := BoothLeastRotation(sequence)
concatenatedSequence := sequence + sequence
rotationIndex := boothLeastRotation(sequence)
var sequenceBuilder strings.Builder

// writing the same sequence twice. using a builder in case of a very long circular genome.
sequenceBuilder.WriteString(sequence)
sequenceBuilder.WriteString(sequence)

concatenatedSequence := sequenceBuilder.String()
sequence = concatenatedSequence[rotationIndex : rotationIndex+len(sequence)]
return sequence
}
Expand Down
16 changes: 16 additions & 0 deletions hash_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ import (
"lukechampine.com/blake3"
)

// ExampleHash reads the pUC19 GenBank fixture, hashes it with a freshly
// constructed blake3 hasher, and prints the resulting digest.
func ExampleHash() {
	plasmid := ReadGbk("data/puc19.gbk")

	// Hash is handed a brand new hasher built by blake3.New.
	hasher := blake3.New(32, nil)
	digest := plasmid.Hash(hasher)
	fmt.Println(digest)

	// output: 4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9
}

func TestHashRegression(t *testing.T) {
puc19GbkBlake3Hash := "4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9"
puc19 := ReadGbk("data/puc19.gbk")
Expand All @@ -28,6 +35,15 @@ func TestHashRegression(t *testing.T) {
}
}

// ExampleRotateSequence shows that RotateSequence returns the same result for
// the pUC19 sequence and for a copy of it whose two halves have been swapped.
func ExampleRotateSequence() {
	puc19 := ReadGbk("data/puc19.gbk")
	original := puc19.Sequence
	midpoint := len(original) / 2

	// Move the first half behind the second half to get a shifted copy of
	// the same sequence.
	shifted := original[midpoint:] + original[:midpoint]

	sameRotation := RotateSequence(original) == RotateSequence(shifted)
	fmt.Println(sameRotation)
	// output: true
}

func TestLeastRotation(t *testing.T) {
sequence := ReadGbk("data/puc19.gbk")
var sequenceBuffer bytes.Buffer
Expand Down
20 changes: 11 additions & 9 deletions io.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ GFF specific IO related things begin here.
******************************************************************************/

// ParseGff takes in a byte slice representing a gffv3 file and parses it into a Sequence object.
func ParseGff(gff string) Sequence {
func ParseGff(gffBytes []byte) Sequence {

gff := string(gffBytes)
sequence := Sequence{}

lines := strings.Split(gff, "\n")
Expand Down Expand Up @@ -327,7 +329,7 @@ func ReadGff(path string) Sequence {
if err != nil {
// return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err)
} else {
sequence = ParseGff(string(file))
sequence = ParseGff(file)
}
return sequence
}
Expand Down Expand Up @@ -392,8 +394,8 @@ FASTA specific IO related things begin here.
******************************************************************************/

// ParseFASTA parses a Sequence struct from a FASTA file and adds appropriate pointers to the structs.
func ParseFASTA(fasta string) Sequence {

func ParseFASTA(fastaBytes []byte) Sequence {
fasta := string(fastaBytes)
var sequence Sequence
var feature Feature
var features []Feature
Expand Down Expand Up @@ -500,8 +502,8 @@ func ReadFASTA(path string) Sequence {
if err != nil {
// return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err)
}
annotatedSequenceArray := ParseFASTA(string(file))
return annotatedSequenceArray
sequence := ParseFASTA(file)
return sequence
}

// WriteFASTA writes a Sequence struct out to FASTA.
Expand All @@ -522,8 +524,9 @@ GBK specific IO related things begin here.
******************************************************************************/

// ParseGbk takes in a byte slice representing a gbk/gb/genbank file and parses it into a Sequence object.
func ParseGbk(gbk string) Sequence {
func ParseGbk(gbkBytes []byte) Sequence {

gbk := string(gbkBytes)
lines := strings.Split(gbk, "\n")

// Create meta struct
Expand Down Expand Up @@ -716,8 +719,7 @@ func ReadGbk(path string) Sequence {
if err != nil {
// return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err)
} else {
gbkString := string(file)
sequence = ParseGbk(gbkString)
sequence = ParseGbk(file)

}
return sequence
Expand Down
Loading

0 comments on commit 9cd47eb

Please sign in to comment.