-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
example of how to use binary (de)serialisation
- Loading branch information
Showing
1 changed file
with
223 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,223 @@ | ||
//! This example demonstrates how to safe bio-seq sequences to binary files | ||
//! | ||
//! It encodes amino acid or DNA sequences from the first record of a fasta file | ||
//! and saves them to a binary file along with metadata about the encoding | ||
//! | ||
//! Run with: | ||
//! ```sh | ||
//! cargo run --example seq2bin -- amino path/to/proteins.faa --out proteins.bin | ||
//! cargo run --example seq2bin -- open proteins.bin | ||
//! ``` | ||
#![warn(clippy::pedantic)] | ||
use std::fs::File; | ||
use std::io::{self, BufReader, Read, Write}; | ||
use std::path::PathBuf; | ||
|
||
use bio_seq::prelude::*; | ||
use clap::{Parser, Subcommand}; | ||
use noodles::fasta::Reader; | ||
use serde::{Deserialize, Serialize}; | ||
|
||
/// We need to associate the sequence's codec with the binary data. The | ||
/// codec parameter in `Seq<A: Codec>` is not actually recorded by serde. | ||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] | ||
enum TaggedSeq { | ||
Dna(Seq<Dna>), | ||
Amino(Seq<Amino>), | ||
} | ||
|
||
#[derive(Parser)] | ||
#[command( | ||
name = "seq2bin", | ||
about = "Read the first record of a fasta file and write its bit-packed encoding to a binary image" | ||
)] | ||
struct Cli { | ||
#[command(subcommand)] | ||
command: Commands, | ||
} | ||
|
||
#[derive(Subcommand)] | ||
enum Commands { | ||
Dna { | ||
input: PathBuf, | ||
|
||
#[arg(short, long)] | ||
out: PathBuf, | ||
}, | ||
Amino { | ||
input: PathBuf, | ||
|
||
#[arg(short, long)] | ||
out: PathBuf, | ||
}, | ||
Open { | ||
input: PathBuf, | ||
}, | ||
} | ||
|
||
/// Save a DNA or Amino acid sequence to a binary file | ||
fn save_bin(seq: &TaggedSeq, path: &PathBuf) -> io::Result<()> { | ||
let bs: Vec<u8> = bincode::serialize(seq).unwrap(); | ||
|
||
let mut fd = File::create(path)?; | ||
fd.write_all(&bs)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
/// Load a sequence from a binary file | ||
fn load_bin(path: &PathBuf) -> io::Result<String> { | ||
let mut buf = Vec::new(); | ||
let mut fd = File::open(path)?; | ||
|
||
fd.read_to_end(&mut buf)?; | ||
|
||
// Deserialise that raw buffer as a tagged Seq struct | ||
let data: TaggedSeq = | ||
bincode::deserialize(&buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; | ||
|
||
Ok(match data { | ||
// We can handle sequences based on their encoding | ||
TaggedSeq::Dna(seq) => seq.to_string(), | ||
TaggedSeq::Amino(seq) => seq.to_string(), | ||
}) | ||
} | ||
|
||
/// Read the first record of a fasta file and encode with associated codec | ||
fn load_fasta<A: Codec>(path: &PathBuf) -> io::Result<Seq<A>> { | ||
let mut rdr = Reader::new(BufReader::new(File::open(path)?)); | ||
|
||
// read first fasta record | ||
let record = rdr | ||
.records() | ||
.next() | ||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Empty fasta file")) | ||
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; | ||
|
||
// extract sequence from record and try to convert it do a `Seq` | ||
let seq: Seq<A> = record?.sequence().as_ref().try_into()?; | ||
|
||
Ok(seq) | ||
} | ||
|
||
fn main() -> io::Result<()> { | ||
let args = Cli::parse(); | ||
|
||
match args.command { | ||
Commands::Dna { input, out } => { | ||
let seq: Seq<Dna> = load_fasta(&input)?; | ||
save_bin(&TaggedSeq::Dna(seq), &out)?; | ||
} | ||
Commands::Amino { input, out } => { | ||
let seq: Seq<Amino> = load_fasta(&input)?; | ||
save_bin(&TaggedSeq::Amino(seq), &out)?; | ||
} | ||
Commands::Open { input } => { | ||
let seq: String = load_bin(&input)?; | ||
println!("{seq}"); | ||
} | ||
} | ||
|
||
Ok(()) | ||
} | ||
|
||
// These tests were generated by LLM | ||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
use std::io::Write; | ||
use tempfile::NamedTempFile; | ||
|
||
fn create_temp_fasta(sequence: &str) -> io::Result<NamedTempFile> { | ||
let mut file = NamedTempFile::new()?; | ||
writeln!(file, ">test_sequence")?; | ||
writeln!(file, "{}", sequence)?; | ||
Ok(file) | ||
} | ||
|
||
#[test] | ||
fn test_dna_roundtrip() -> io::Result<()> { | ||
// Create a temporary FASTA file with DNA sequence | ||
let raw_seq = | ||
"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCGATCGAAAAAAAAAAAAA"; | ||
let fasta_file = create_temp_fasta(raw_seq)?; | ||
let bin_file = NamedTempFile::new()?; | ||
|
||
// Load from FASTA and save to binary | ||
let seq: Seq<Dna> = load_fasta(&fasta_file.path().to_path_buf())?; | ||
save_bin(&TaggedSeq::Dna(seq), &bin_file.path().to_path_buf())?; | ||
|
||
// Load back from binary and verify | ||
let loaded_seq = load_bin(&bin_file.path().to_path_buf())?; | ||
assert_eq!(loaded_seq, raw_seq); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn test_amino_roundtrip() -> io::Result<()> { | ||
// Create a temporary FASTA file with amino acid sequence | ||
let raw_seq = "NIFLCVWGGVFSRVSLCARGALSPRAPPLL*SVYTLYM*ERGDTRDISQSAHTPHI*KRENTQK"; | ||
let fasta_file = create_temp_fasta(raw_seq)?; | ||
let bin_file = NamedTempFile::new()?; | ||
|
||
// Load from FASTA and save to binary | ||
let seq: Seq<Amino> = load_fasta(&fasta_file.path().to_path_buf())?; | ||
save_bin(&TaggedSeq::Amino(seq), &bin_file.path().to_path_buf())?; | ||
|
||
// Load back from binary and verify | ||
let loaded_seq = load_bin(&bin_file.path().to_path_buf())?; | ||
assert_eq!(loaded_seq, raw_seq); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn test_invalid_fasta() { | ||
// Create a temporary FASTA file with invalid sequence | ||
let invalid_seq = "ACTGCTAN"; // Invalid DNA sequence | ||
let fasta_file = create_temp_fasta(invalid_seq).unwrap(); | ||
|
||
// Attempt to load as DNA sequence should fail | ||
let result: Result<Seq<Dna>, _> = load_fasta(&fasta_file.path().to_path_buf()); | ||
assert!(result.is_err()); | ||
} | ||
|
||
#[test] | ||
fn test_empty_fasta() { | ||
// Create an empty FASTA file | ||
let file = NamedTempFile::new().unwrap(); | ||
|
||
// Attempt to load should fail | ||
let result: Result<Seq<Dna>, _> = load_fasta(&file.path().to_path_buf()); | ||
assert!(result.is_err()); | ||
} | ||
|
||
#[test] | ||
fn test_corrupted_binary() { | ||
// Create a binary file with invalid data | ||
let file = NamedTempFile::new().unwrap(); | ||
write!(file.as_file(), "not a valid binary sequence").unwrap(); | ||
|
||
// Attempt to load should fail | ||
let result = load_bin(&file.path().to_path_buf()); | ||
assert!(result.is_err()); | ||
} | ||
|
||
#[test] | ||
fn test_large_sequence() -> io::Result<()> { | ||
// Create a large sequence | ||
let large_dna = "ATCG".repeat(1000); // 4kb sequence | ||
let fasta_file = create_temp_fasta(&large_dna)?; | ||
let bin_file = NamedTempFile::new()?; | ||
|
||
// Test roundtrip | ||
let seq: Seq<Dna> = load_fasta(&fasta_file.path().to_path_buf())?; | ||
save_bin(&TaggedSeq::Dna(seq), &bin_file.path().to_path_buf())?; | ||
let loaded_seq = load_bin(&bin_file.path().to_path_buf())?; | ||
|
||
assert_eq!(loaded_seq, large_dna); | ||
Ok(()) | ||
} | ||
} |