Skip to content

Commit

Permalink
example of how to use binary (de)serialisation
Browse files Browse the repository at this point in the history
  • Loading branch information
jeff-k committed Dec 19, 2024
1 parent af15abf commit c1c07bb
Showing 1 changed file with 223 additions and 0 deletions.
223 changes: 223 additions & 0 deletions bio-seq/examples/seq2bin.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
//! This example demonstrates how to safe bio-seq sequences to binary files
//!
//! It encodes amino acid or DNA sequences from the first record of a fasta file
//! and saves them to a binary file along with metadata about the encoding
//!
//! Run with:
//! ```sh
//! cargo run --example seq2bin -- amino path/to/proteins.faa --out proteins.bin
//! cargo run --example seq2bin -- open proteins.bin
//! ```
#![warn(clippy::pedantic)]
use std::fs::File;
use std::io::{self, BufReader, Read, Write};
use std::path::PathBuf;

use bio_seq::prelude::*;
use clap::{Parser, Subcommand};
use noodles::fasta::Reader;
use serde::{Deserialize, Serialize};

/// We need to associate the sequence's codec with the binary data. The
/// codec parameter in `Seq<A: Codec>` is not actually recorded by serde.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
enum TaggedSeq {
Dna(Seq<Dna>),
Amino(Seq<Amino>),
}

#[derive(Parser)]
#[command(
name = "seq2bin",
about = "Read the first record of a fasta file and write its bit-packed encoding to a binary image"
)]
struct Cli {
#[command(subcommand)]
command: Commands,
}

#[derive(Subcommand)]
enum Commands {
Dna {
input: PathBuf,

#[arg(short, long)]
out: PathBuf,
},
Amino {
input: PathBuf,

#[arg(short, long)]
out: PathBuf,
},
Open {
input: PathBuf,
},
}

/// Save a DNA or Amino acid sequence to a binary file
fn save_bin(seq: &TaggedSeq, path: &PathBuf) -> io::Result<()> {
let bs: Vec<u8> = bincode::serialize(seq).unwrap();

let mut fd = File::create(path)?;
fd.write_all(&bs)?;

Ok(())
}

/// Load a sequence from a binary file
fn load_bin(path: &PathBuf) -> io::Result<String> {
let mut buf = Vec::new();
let mut fd = File::open(path)?;

fd.read_to_end(&mut buf)?;

// Deserialise that raw buffer as a tagged Seq struct
let data: TaggedSeq =
bincode::deserialize(&buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

Ok(match data {
// We can handle sequences based on their encoding
TaggedSeq::Dna(seq) => seq.to_string(),
TaggedSeq::Amino(seq) => seq.to_string(),
})
}

/// Read the first record of a fasta file and encode with associated codec
fn load_fasta<A: Codec>(path: &PathBuf) -> io::Result<Seq<A>> {
let mut rdr = Reader::new(BufReader::new(File::open(path)?));

// read first fasta record
let record = rdr
.records()
.next()
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Empty fasta file"))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

// extract sequence from record and try to convert it do a `Seq`
let seq: Seq<A> = record?.sequence().as_ref().try_into()?;

Ok(seq)
}

fn main() -> io::Result<()> {
let args = Cli::parse();

match args.command {
Commands::Dna { input, out } => {
let seq: Seq<Dna> = load_fasta(&input)?;
save_bin(&TaggedSeq::Dna(seq), &out)?;
}
Commands::Amino { input, out } => {
let seq: Seq<Amino> = load_fasta(&input)?;
save_bin(&TaggedSeq::Amino(seq), &out)?;
}
Commands::Open { input } => {
let seq: String = load_bin(&input)?;
println!("{seq}");
}
}

Ok(())
}

// These tests were generated by LLM
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;

fn create_temp_fasta(sequence: &str) -> io::Result<NamedTempFile> {
let mut file = NamedTempFile::new()?;
writeln!(file, ">test_sequence")?;
writeln!(file, "{}", sequence)?;
Ok(file)
}

#[test]
fn test_dna_roundtrip() -> io::Result<()> {
// Create a temporary FASTA file with DNA sequence
let raw_seq =
"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCGATCGAAAAAAAAAAAAA";
let fasta_file = create_temp_fasta(raw_seq)?;
let bin_file = NamedTempFile::new()?;

// Load from FASTA and save to binary
let seq: Seq<Dna> = load_fasta(&fasta_file.path().to_path_buf())?;
save_bin(&TaggedSeq::Dna(seq), &bin_file.path().to_path_buf())?;

// Load back from binary and verify
let loaded_seq = load_bin(&bin_file.path().to_path_buf())?;
assert_eq!(loaded_seq, raw_seq);

Ok(())
}

#[test]
fn test_amino_roundtrip() -> io::Result<()> {
// Create a temporary FASTA file with amino acid sequence
let raw_seq = "NIFLCVWGGVFSRVSLCARGALSPRAPPLL*SVYTLYM*ERGDTRDISQSAHTPHI*KRENTQK";
let fasta_file = create_temp_fasta(raw_seq)?;
let bin_file = NamedTempFile::new()?;

// Load from FASTA and save to binary
let seq: Seq<Amino> = load_fasta(&fasta_file.path().to_path_buf())?;
save_bin(&TaggedSeq::Amino(seq), &bin_file.path().to_path_buf())?;

// Load back from binary and verify
let loaded_seq = load_bin(&bin_file.path().to_path_buf())?;
assert_eq!(loaded_seq, raw_seq);

Ok(())
}

#[test]
fn test_invalid_fasta() {
// Create a temporary FASTA file with invalid sequence
let invalid_seq = "ACTGCTAN"; // Invalid DNA sequence
let fasta_file = create_temp_fasta(invalid_seq).unwrap();

// Attempt to load as DNA sequence should fail
let result: Result<Seq<Dna>, _> = load_fasta(&fasta_file.path().to_path_buf());
assert!(result.is_err());
}

#[test]
fn test_empty_fasta() {
// Create an empty FASTA file
let file = NamedTempFile::new().unwrap();

// Attempt to load should fail
let result: Result<Seq<Dna>, _> = load_fasta(&file.path().to_path_buf());
assert!(result.is_err());
}

#[test]
fn test_corrupted_binary() {
// Create a binary file with invalid data
let file = NamedTempFile::new().unwrap();
write!(file.as_file(), "not a valid binary sequence").unwrap();

// Attempt to load should fail
let result = load_bin(&file.path().to_path_buf());
assert!(result.is_err());
}

#[test]
fn test_large_sequence() -> io::Result<()> {
// Create a large sequence
let large_dna = "ATCG".repeat(1000); // 4kb sequence
let fasta_file = create_temp_fasta(&large_dna)?;
let bin_file = NamedTempFile::new()?;

// Test roundtrip
let seq: Seq<Dna> = load_fasta(&fasta_file.path().to_path_buf())?;
save_bin(&TaggedSeq::Dna(seq), &bin_file.path().to_path_buf())?;
let loaded_seq = load_bin(&bin_file.path().to_path_buf())?;

assert_eq!(loaded_seq, large_dna);
Ok(())
}
}

0 comments on commit c1c07bb

Please sign in to comment.