-
-
Notifications
You must be signed in to change notification settings - Fork 73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add fasta2 package #337
Closed
Closed
Add fasta2 package #337
Changes from 1 commit
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package fasta2_test | ||
|
||
import ( | ||
"fmt" | ||
|
||
"github.com/TimothyStiles/poly/io/fasta2" | ||
) | ||
|
||
// ExampleReadFile shows basic usage for ReadFile | ||
func ExampleReadFile() { | ||
fastas, _ := fasta2.ReadFile("testdata/base.fasta") | ||
|
||
fmt.Println(fastas[0].Header) | ||
// Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
package fasta2 | ||
|
||
import ( | ||
"bytes" | ||
"fmt" | ||
"io" | ||
"os" | ||
) | ||
|
||
// Record is a single FASTA entry: a Header (written after a leading '>') and
// the Sequence that belongs to it.
type Record struct {
	Header   string `json:"header"`
	Sequence string `json:"sequence"`
}

// buffer serializes the Record into a freshly allocated buffer: a '>' followed
// by the header, then the sequence wrapped at 80 characters per line, then a
// final newline.
func (r Record) buffer() bytes.Buffer {
	const lineWidth = 80

	var b bytes.Buffer
	// Reserve the whole output up front so the buffer allocates once:
	// '>' + header, one '\n' in front of every sequence line, the sequence
	// itself and the closing '\n'.
	lines := (len(r.Sequence) + lineWidth - 1) / lineWidth
	b.Grow(1 + len(r.Header) + lines + len(r.Sequence) + 1)

	b.WriteByte('>')
	b.WriteString(r.Header)

	// Count characters explicitly: the index produced by ranging over a string
	// is a byte offset, which would wrap at the wrong column (or skip a wrap)
	// as soon as the sequence contains a multi-byte rune.
	written := 0
	for _, c := range r.Sequence {
		if written%lineWidth == 0 {
			b.WriteByte('\n')
		}
		b.WriteRune(c)
		written++
	}
	b.WriteByte('\n')

	return b
}

// String returns the FASTA text representation of the Record.
func (r Record) String() string {
	b := r.buffer()
	return b.String()
}

// Bytes returns the FASTA representation of the Record as a []byte.
func (r Record) Bytes() []byte {
	b := r.buffer()
	return b.Bytes()
}

// Write writes the FASTA representation of the Record to w. Any error reported
// by w is returned wrapped.
func (r Record) Write(w io.Writer) error {
	if _, err := w.Write(r.Bytes()); err != nil {
		return fmt.Errorf("error writing record to io.Writer: %w", err)
	}

	return nil
}
|
||
// Write writes a fasta array to an io.Writer | ||
func Write(recs []Record, w io.Writer) error { | ||
for _, r := range recs { | ||
err := r.Write(w) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
// WriteFile writes all the passed records to the file at path. | ||
func WriteFile(recs []Record, path string) error { | ||
f, err := os.Create(path) | ||
if err != nil { | ||
return fmt.Errorf("error opening file %q: %w", path, err) | ||
} | ||
defer f.Close() | ||
for _, r := range recs { | ||
err := r.Write(f) | ||
if err != nil { | ||
return fmt.Errorf("error writing to file %q: %w", path, err) | ||
} | ||
} | ||
|
||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
package fasta2_test | ||
|
||
import ( | ||
"bytes" | ||
"io" | ||
"os" | ||
"path" | ||
"reflect" | ||
"testing" | ||
|
||
"github.com/TimothyStiles/poly/io/fasta2" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestFastaString(t *testing.T) { | ||
type fields struct { | ||
Header string | ||
Sequence string | ||
} | ||
tests := []struct { | ||
header string | ||
fields fields | ||
want string | ||
}{ | ||
{ | ||
header: "success", | ||
fields: fields{ | ||
Header: "Cool Sequence", | ||
Sequence: "MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL", | ||
}, | ||
want: ">Cool Sequence\nMDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGF\nITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL\n", | ||
}, | ||
} | ||
for _, tt := range tests { | ||
t.Run(tt.header, func(t *testing.T) { | ||
f := fasta2.Record{ | ||
Header: tt.fields.Header, | ||
Sequence: tt.fields.Sequence, | ||
} | ||
if got := f.String(); got != tt.want { | ||
t.Errorf("Record.String() = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestRecord_Bytes(t *testing.T) { | ||
type fields struct { | ||
Header string | ||
Sequence string | ||
} | ||
tests := []struct { | ||
name string | ||
fields fields | ||
want []byte | ||
}{ | ||
{ | ||
name: "success", | ||
fields: fields{ | ||
Header: "Cool Sequence", | ||
Sequence: "MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL", | ||
}, | ||
want: []byte(">Cool Sequence\nMDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGF\nITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL\n"), | ||
}, | ||
} | ||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
r := fasta2.Record{ | ||
Header: tt.fields.Header, | ||
Sequence: tt.fields.Sequence, | ||
} | ||
if got := r.Bytes(); !reflect.DeepEqual(got, tt.want) { | ||
t.Errorf("Record.Bytes() = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestRecordWrite(t *testing.T) { | ||
sequence := fasta2.Record{ | ||
Header: "Cool Sequence", | ||
Sequence: "MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL", | ||
} | ||
expected := ">Cool Sequence\nMDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGF\nITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL\n" | ||
t.Run("success", func(t *testing.T) { | ||
r := sequence | ||
w := &bytes.Buffer{} | ||
err := r.Write(w) | ||
require.NoError(t, err) | ||
require.Equal(t, expected, w.String()) | ||
}) | ||
t.Run("fail truncated", func(t *testing.T) { | ||
r := sequence | ||
w := errorWriter{} | ||
err := r.Write(w) | ||
require.Error(t, err) | ||
}) | ||
} | ||
|
||
func TestWrite(t *testing.T) { | ||
recs := []fasta2.Record{ | ||
{ | ||
Header: "name1", | ||
Sequence: "seq1", | ||
}, | ||
{ | ||
Header: "name2", | ||
Sequence: "seq2", | ||
}, | ||
} | ||
t.Run("success", func(t *testing.T) { | ||
w := &bytes.Buffer{} | ||
err := fasta2.Write(recs, w) | ||
require.NoError(t, err) | ||
require.Equal(t, ">name1\nseq1\n>name2\nseq2\n", w.String()) | ||
}) | ||
t.Run("fail EOF", func(t *testing.T) { | ||
w := errorWriter{} | ||
err := fasta2.Write(recs, w) | ||
require.Error(t, err) | ||
}) | ||
} | ||
|
||
func TestWriteFile(t *testing.T) { | ||
path := path.Join(os.TempDir(), "fasta_test") | ||
defer os.Remove(path) // clean up | ||
recs := []fasta2.Record{ | ||
{ | ||
Header: "name1", | ||
Sequence: "seq1", | ||
}, | ||
{ | ||
Header: "name2", | ||
Sequence: "seq2", | ||
}, | ||
} | ||
err := fasta2.WriteFile(recs, path) | ||
require.NoError(t, err) | ||
} | ||
|
||
// errorWriter is a test double for io.Writer whose Write always fails.
type errorWriter struct{}

// Write reports the full length of p as written together with io.EOF.
func (errorWriter) Write(p []byte) (int, error) {
	written := len(p)
	return written, io.EOF
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
package fasta2 | ||
|
||
import ( | ||
"bufio" | ||
"bytes" | ||
"fmt" | ||
"io" | ||
"os" | ||
) | ||
|
||
// Parser is a fasta parser it is initialized by the NewParser() function. | ||
type Parser struct { | ||
buff bytes.Buffer | ||
header string | ||
start bool | ||
scanner *bufio.Scanner | ||
line int | ||
more bool | ||
} | ||
|
||
func NewParser(r io.Reader) *Parser { | ||
return &Parser{ | ||
start: true, | ||
more: true, | ||
scanner: bufio.NewScanner(r), | ||
} | ||
} | ||
|
||
// Lines returns the number of lines parsed. | ||
func (p *Parser) Lines() int { | ||
return p.line | ||
} | ||
|
||
// HasNext returns true if the parser can continue parsing. | ||
func (p *Parser) HasNext() bool { | ||
return p.more | ||
} | ||
|
||
func (p *Parser) newRecord() Record { | ||
sequence := p.buff.String() | ||
record := Record{ | ||
Header: p.header, | ||
Sequence: sequence, | ||
} | ||
// Reset sequence buffer | ||
p.buff.Reset() | ||
return record | ||
} | ||
|
||
// Next parsed the next record in the io.Reader and returns it, in case | ||
// something went wrong an error and the partial result is returned. | ||
func (p *Parser) Next() (Record, error) { | ||
for p.scanner.Scan() { | ||
line := p.scanner.Bytes() | ||
p.line++ | ||
switch { | ||
// if there's nothing on this line skip this iteration of the loop | ||
case len(line) == 0: | ||
continue | ||
// if it's a comment skip this line | ||
case line[0] == ';': | ||
continue | ||
// start of file with no header, error | ||
case line[0] != '>' && p.start: | ||
err := fmt.Errorf("invalid input: missing sequence header for sequence starting at line %d", p.line) | ||
record := p.newRecord() | ||
return record, err | ||
// start of a fasta line | ||
case line[0] != '>': | ||
p.buff.Write(line) | ||
// Process normal new lines | ||
case line[0] == '>' && !p.start: | ||
record := p.newRecord() | ||
// New name | ||
p.header = string(line[1:]) | ||
return record, nil | ||
// Process first line of file | ||
case line[0] == '>' && p.start: | ||
p.header = string(line[1:]) | ||
p.start = false | ||
} | ||
} | ||
p.more = false | ||
// Add final sequence in file | ||
record := p.newRecord() | ||
return record, p.scanner.Err() | ||
} | ||
|
||
// ParseAll will parse all the records found in the reader and returns them in | ||
// a slice. | ||
func ParseAll(r io.Reader) ([]Record, error) { | ||
var ( | ||
ret []Record | ||
p = NewParser(r) | ||
) | ||
|
||
for p.HasNext() { | ||
rec, err := p.Next() | ||
if err != nil { | ||
return ret, err | ||
} | ||
ret = append(ret, rec) | ||
} | ||
|
||
return ret, nil | ||
} | ||
|
||
// ReadFile will parse all the records found in the file and returns them in | ||
// a slice. | ||
func ReadFile(path string) ([]Record, error) { | ||
var ret []Record | ||
f, err := os.Open(path) | ||
if err != nil { | ||
return nil, fmt.Errorf("error while reading file %q: %w", path, err) | ||
} | ||
defer f.Close() | ||
|
||
p := NewParser(f) | ||
for p.HasNext() { | ||
rec, err := p.Next() | ||
if err != nil { | ||
return ret, err | ||
} | ||
ret = append(ret, rec) | ||
} | ||
|
||
return ret, nil | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Write transforms the bytes into memory (in the bytes buffer) before actually writing them. Why not just directly write to the io Writer?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, and also call it `WriteTo` and implement the `io.WriterTo` interface.
I mostly tried to avoid too many allocations: since `io.Writer` takes a `[]byte` as its argument, the record needs to be serialized into a `[]byte` first, and there was already a way to get the record as a `[]byte`.
I was actually just looking at it right now; I might send a fixup later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, writing to a `bytes.Buffer` almost always returns `nil` errors, so it's just simpler.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I sent a small fixup. I think at this point the `Record.Write()` method can be removed, as the user has a way of getting a `string` and a `[]byte` representation of the `Record`.
For bulk writing, the `Write()` function is now reusing the same buffer (and buffering the I/O is usually a good thing™).
It would be cool to have a generic `Record.Write()` method that takes either a `bytes.Buffer` or a `bufio.Writer`, so we can just decide to pass the buffer or wrap the writer. Unfortunately I am traveling tomorrow until next week, so no time this weekend 🙁