Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fasta2 package #337

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions io/fasta2/example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package fasta2_test

import (
"fmt"

"github.com/TimothyStiles/poly/io/fasta2"
)

// ExampleReadFile shows basic usage of ReadFile: it parses a FASTA file and
// prints the header of the first record.
func ExampleReadFile() {
	fastas, err := fasta2.ReadFile("testdata/base.fasta")
	if err != nil {
		// Report the failure instead of panicking on the index below.
		fmt.Println(err)
		return
	}

	fmt.Println(fastas[0].Header)
	// Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
}
85 changes: 85 additions & 0 deletions io/fasta2/fasta.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package fasta2

import (
"bytes"
"fmt"
"io"
"os"
)

// Record is a single FASTA entry: a Header (stored without the leading '>')
// and its corresponding Sequence.
type Record struct {
	Header   string `json:"header"`
	Sequence string `json:"sequence"`
}

// buffer serializes the Record into a bytes.Buffer: a '>' followed by the
// Header, then the Sequence with a '\n' written before every 80th byte offset
// (80-character lines for ASCII sequences), and a final trailing '\n'.
func (r Record) buffer() bytes.Buffer {
	const lineWidth = 80

	// One line break is written per started sequence line, that is
	// ceil(len(Sequence) / lineWidth) of them.
	breaks := (len(r.Sequence) + lineWidth - 1) / lineWidth

	var b bytes.Buffer
	// Grow once up front: '>' + header + sequence + line breaks + final '\n'.
	b.Grow(1 + len(r.Header) + len(r.Sequence) + breaks + 1)
	b.WriteByte('>')
	b.WriteString(r.Header)
	for i, c := range r.Sequence {
		// i is the byte offset of c, so a break is emitted every lineWidth
		// bytes; the first one (i == 0) terminates the header line.
		if i%lineWidth == 0 {
			b.WriteByte('\n')
		}
		b.WriteRune(c)
	}
	b.WriteByte('\n')

	return b
}

// String returns the FASTA text representation of the Record, making Record
// usable as a fmt.Stringer.
func (r Record) String() string {
	serialized := r.buffer()
	return serialized.String()
}

// Bytes returns the FASTA representation of the Record as a byte slice.
func (r Record) Bytes() []byte {
	serialized := r.buffer()
	return serialized.Bytes()
}

// Write serializes the Record and hands the result to w in a single call to
// w.Write. An error reported by the writer is returned wrapped.
func (r Record) Write(w io.Writer) error {
	if _, err := w.Write(r.Bytes()); err != nil {
		return fmt.Errorf("error writing record to io.Writer: %w", err)
	}

	return nil
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Write transforms the bytes into memory (in the bytes buffer) before actually writing them. Why not just directly write to the io Writer?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah and also call it WriteTo and implement the io.WriterTo interface.
I mostly tried to avoid too many allocations: io.Writer takes a []byte as its argument, so the record needs to be serialized first into a []byte, and there was already a way to get the record as a []byte.

I was actually just looking at it right now; I might send a fixup later.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, writing to a bytes.Buffer almost always returns a nil error, so it's just simpler.

Copy link
Contributor Author

@folago folago Aug 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I sent a small fixup, I think at this point the Record.Write() method can be removed, as the user has a way of getting a string and []byte representation of the Record.
For bulk writing the Write() function is now reusing the same buffer (and buffering the I/O is usually a goodthing™).

It would be cool to have a generic Record.Write() method that takes either a bytes.Buffer or a bufio.Writer so we can just decide to pass the buffer or wrap the writer. I unfortunately am traveling tomorrow until next week so no time this weekend 🙁


// Write serializes every record in recs, in order, to w. It stops at the
// first record that fails to write and returns that error as is.
func Write(recs []Record, w io.Writer) error {
	for i := range recs {
		if err := recs[i].Write(w); err != nil {
			return err
		}
	}
	return nil
}

// WriteFile creates (or truncates) the file at path and writes all the passed
// records to it.
//
// It returns an error if the file cannot be created, if any record fails to
// be written, or if closing the file fails. The close error is checked
// because a failed Close on a written file can mean data never reached disk.
func WriteFile(recs []Record, path string) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("error opening file %q: %w", path, err)
	}
	defer func() {
		// Surface the close error unless an earlier error is already
		// being returned.
		if cerr := f.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("error closing file %q: %w", path, cerr)
		}
	}()

	if err := Write(recs, f); err != nil {
		return fmt.Errorf("error writing to file %q: %w", path, err)
	}

	return nil
}
146 changes: 146 additions & 0 deletions io/fasta2/fasta_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
package fasta2_test

import (
"bytes"
"io"
"os"
"path"
"reflect"
"testing"

"github.com/TimothyStiles/poly/io/fasta2"
"github.com/stretchr/testify/require"
)

// TestFastaString checks the text produced by Record.String for a sequence
// longer than one 80-character line.
func TestFastaString(t *testing.T) {
	cases := []struct {
		name   string
		record fasta2.Record
		want   string
	}{
		{
			name: "success",
			record: fasta2.Record{
				Header:   "Cool Sequence",
				Sequence: "MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL",
			},
			want: ">Cool Sequence\nMDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGF\nITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL\n",
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.record.String()
			if got != tc.want {
				t.Errorf("Record.String() = %v, want %v", got, tc.want)
			}
		})
	}
}

// TestRecord_Bytes checks the byte slice produced by Record.Bytes for a
// sequence longer than one 80-character line.
func TestRecord_Bytes(t *testing.T) {
	cases := []struct {
		name   string
		record fasta2.Record
		want   []byte
	}{
		{
			name: "success",
			record: fasta2.Record{
				Header:   "Cool Sequence",
				Sequence: "MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL",
			},
			want: []byte(">Cool Sequence\nMDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGF\nITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL\n"),
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.record.Bytes()
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("Record.Bytes() = %v, want %v", got, tc.want)
			}
		})
	}
}

// TestRecordWrite checks Record.Write against a working in-memory writer and
// against a writer that always returns an error.
func TestRecordWrite(t *testing.T) {
	rec := fasta2.Record{
		Header:   "Cool Sequence",
		Sequence: "MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL",
	}
	want := ">Cool Sequence\nMDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGF\nITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL\n"

	t.Run("success", func(t *testing.T) {
		var out bytes.Buffer
		require.NoError(t, rec.Write(&out))
		require.Equal(t, want, out.String())
	})
	t.Run("fail truncated", func(t *testing.T) {
		require.Error(t, rec.Write(errorWriter{}))
	})
}

// TestWrite checks that Write emits all records back to back and that it
// propagates an error from the underlying writer.
func TestWrite(t *testing.T) {
	records := []fasta2.Record{
		{Header: "name1", Sequence: "seq1"},
		{Header: "name2", Sequence: "seq2"},
	}

	t.Run("success", func(t *testing.T) {
		var out bytes.Buffer
		require.NoError(t, fasta2.Write(records, &out))
		require.Equal(t, ">name1\nseq1\n>name2\nseq2\n", out.String())
	})
	t.Run("fail EOF", func(t *testing.T) {
		require.Error(t, fasta2.Write(records, errorWriter{}))
	})
}

// TestWriteFile checks that WriteFile succeeds when writing two records to a
// file in the OS temporary directory.
func TestWriteFile(t *testing.T) {
	// Named target rather than path so the path package is not shadowed.
	target := path.Join(os.TempDir(), "fasta_test")
	defer os.Remove(target) // clean up

	records := []fasta2.Record{
		{Header: "name1", Sequence: "seq1"},
		{Header: "name2", Sequence: "seq2"},
	}
	require.NoError(t, fasta2.WriteFile(records, target))
}

// errorWriter is an io.Writer test double whose Write always fails.
type errorWriter struct{}

// Write reports all of p as written but always returns io.EOF.
func (errorWriter) Write(p []byte) (int, error) {
	return len(p), io.EOF
}
128 changes: 128 additions & 0 deletions io/fasta2/parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package fasta2

import (
"bufio"
"bytes"
"fmt"
"io"
"os"
)

// Parser is a streaming FASTA parser. It must be created with NewParser,
// which sets the initial state that Next and HasNext rely on.
type Parser struct {
	// buff accumulates the sequence lines of the record being parsed.
	buff bytes.Buffer
	// header is the header (without the leading '>') of the record being parsed.
	header string
	// start is true until the first header line has been seen.
	start bool
	// scanner reads the input line by line.
	scanner *bufio.Scanner
	// line counts the lines scanned so far.
	line int
	// more is true until a call to Next has exhausted the scanner.
	more bool
}

// NewParser returns a Parser that reads FASTA records from r.
//
// Input is scanned line by line. By default bufio.Scanner fails with
// bufio.ErrTooLong on lines longer than bufio.MaxScanTokenSize (64 KiB),
// which FASTA files storing a whole sequence on a single line easily exceed,
// so the per-line limit is raised here. The scan buffer still starts small
// and only grows when a longer line is actually encountered.
func NewParser(r io.Reader) *Parser {
	// Upper bound for the length of a single input line.
	const maxLineSize = 1 << 30 // 1 GiB

	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineSize)

	return &Parser{
		start:   true,
		more:    true,
		scanner: scanner,
	}
}

// Lines returns the number of input lines scanned so far. Every scanned line
// is counted, including blank lines and comment lines.
func (p *Parser) Lines() int {
	return p.line
}

// HasNext reports whether Next may still yield a record. It is true for a
// newly created Parser and becomes false once a call to Next has exhausted
// the input; it does not look ahead, so it is also true for empty input
// until Next has been called.
func (p *Parser) HasNext() bool {
	return p.more
}

// newRecord builds a Record from the current header and the sequence
// accumulated so far, then empties the sequence buffer so it can be reused
// for the next record.
func (p *Parser) newRecord() Record {
	rec := Record{
		Header:   p.header,
		Sequence: p.buff.String(),
	}
	p.buff.Reset()
	return rec
}

// Next parses the next record from the underlying reader and returns it.
//
// A record is returned either when the header line of the following record
// is reached, or when the input is exhausted; in the latter case HasNext
// reports false afterwards and the returned error is the scanner's error, if
// any. Empty lines and lines starting with ';' are skipped. If sequence data
// appears before any header line, Next returns an error together with the
// partial record accumulated so far.
//
// NOTE(review): on input with no header and no sequence lines (for example
// an empty reader) Next returns a zero Record and a nil error.
func (p *Parser) Next() (Record, error) {
	for p.scanner.Scan() {
		line := p.scanner.Bytes()
		p.line++
		switch {
		// blank line: skip it
		case len(line) == 0:
			continue
		// comment line: skip it
		case line[0] == ';':
			continue
		// sequence data before any header has been seen: error
		case line[0] != '>' && p.start:
			err := fmt.Errorf("invalid input: missing sequence header for sequence starting at line %d", p.line)
			record := p.newRecord()
			return record, err
		// sequence line: append it to the current record's sequence
		case line[0] != '>':
			p.buff.Write(line)
		// header of a following record: emit the record accumulated so far
		case line[0] == '>' && !p.start:
			record := p.newRecord()
			// remember the new header for the record that starts here
			p.header = string(line[1:])
			return record, nil
		// first header of the input: nothing to emit yet
		case line[0] == '>' && p.start:
			p.header = string(line[1:])
			p.start = false
		}
	}
	// the scanner is exhausted (or failed): no further records can follow
	p.more = false
	// emit the final record of the input
	record := p.newRecord()
	return record, p.scanner.Err()
}

// ParseAll parses all the records found in r and returns them in a slice.
//
// On a parse or read error it returns the records parsed so far together
// with the error. Input that contains no records (empty, or only blank and
// comment lines) yields a nil slice and a nil error.
func ParseAll(r io.Reader) ([]Record, error) {
	var (
		ret []Record
		p   = NewParser(r)
	)

	for p.HasNext() {
		rec, err := p.Next()
		if err != nil {
			return ret, err
		}
		// p.start is only cleared once a header line has been seen. If it is
		// still set after a successful Next, the input was exhausted without
		// containing any record, so the zero Record returned is not kept.
		if p.start {
			break
		}
		ret = append(ret, rec)
	}

	return ret, nil
}

// ReadFile parses all the records found in the file at path and returns them
// in a slice.
//
// It returns a wrapped error if the file cannot be opened. Parsing is
// delegated to ParseAll, so on a parse error the records read so far are
// returned together with that error.
func ReadFile(path string) ([]Record, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("error while reading file %q: %w", path, err)
	}
	defer f.Close()

	return ParseAll(f)
}
Loading