recommend noodles and provide example

jeff-k · Oct 23, 2024 · 88a2f00 · 88a2f00
1 parent 5bcde4f
commit 88a2f00
Show file tree

Hide file tree

Showing 4 changed files with 133 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -66,6 +66,23 @@ let slice: &str = &s[1..3];
 let seqslice: &SeqSlice<Dna> = &seq[2..4];
 ```
 
+Sequences can be encoded from popular third-party crates like [noodles](https://crates.io/crates/noodles):
+
+```
+let mut reader = noodles::fasta::Reader::new(BufReader::new(fasta));
+
+for result in reader.records() {
+    let record = result?;
+    let seq: Seq<Dna> = record
+        .sequence()
+        .as_ref()
+        .try_into()
+        .unwrap()
+
+    // ...
+}
+```
+
 ## Philosophy
 
 Many bioinformatics crates implement their own kmer packing logic. This effort began as a way to define types and traits that allow kmer code to be shared between projects. It quickly became apparent that a kmer type doesn't make sense without being tightly coupled to a general type for sequences. The scope of this crate will be limited to operating on fixed and arbitrary length sequences with an emphasis on safety.
@@ -123,21 +140,30 @@ for i in 0..=15 {
 A lookup table can be indexed in constant time by treating kmers directly as `usize`:
 
 ```rust
-// This example builds a histogram of kmer occurences (on the stack!)
+struct Histogram<C: Codec, const K: usize> {
+    counts: Vec<usize>,
+    _p: PhantomData<C>,
+}
 
-fn kmer_histogram<C: Codec, const K: usize>(seq: &SeqSlice<C>) -> Vec<usize> {
-    // For dna::Dna our histogram will need 4^4
-    // bins to count every possible 4-mer
-    let mut histo = vec![0; 1 << (C::WIDTH * K as u8)];
+impl<C: Codec, const K: usize> Histogram<C, K> {
+    fn new() -> Self {
+        Self {
+            counts: vec![0; 1 << (K * C::BITS as usize)],
+            _p: PhantomData,
+        }
+    }
 
-    for kmer in seq.kmers::<K>() {
-        histo[usize::from(kmer)] += 1;
+    fn add(&mut self, kmer: Kmer<C, K>) {
+        self.counts[usize::from(&kmer)] += 1;
     }
 
-    histo
+    fn get(&self, kmer: Kmer<C, K>) -> usize {
+        self.counts[usize::from(&kmer)]
+    }
 }
 ```
 
+
 ## Sketching
 
 ### Minimising

diff --git a/bio-seq/Cargo.toml b/bio-seq/Cargo.toml
@@ -12,9 +12,9 @@ categories = ["science::bioinformatics", "science::bioinformatics::genomics", "s
 readme = "../README.md"
 rust-version = "1.79"
 
-#[dev-dependencies]
-#bio-streams = { git="https://github.com/jeff-k/bio-streams.git" }
-#clap = { version="4", features=["derive"] }
+[dev-dependencies]
+clap = { version="4", features=["derive"] }
+noodles = { version="0.84", features=["fasta"] }
 
 [dependencies]
 bitvec = "1"
@@ -28,8 +28,9 @@ translation = []
 simd = []
 extra_codecs = []
 
-#[[example]]
-#name = "aminokmers"
+[[example]]
+name = "aminokmers"
+required-features = ["noodles/fasta"]
 
 [package.metadata.docs.rs]
 features = ["translation", "serde", "extra_codecs"]

diff --git a/bio-seq/examples/aminokmers.rs b/bio-seq/examples/aminokmers.rs
@@ -0,0 +1,88 @@
+//! This example demonstrates using bio-seq with [noodles](https://crates.io/crates/noodles) to count amino acid k-mers.
+//!
+//! It encodes amino acids sequences in an efficient bit-packed representation that can be directly used to index an array of counts.
+//!
+//! Run with:
+//! ```sh
+//! cargo run --example aminokmers -- path/to/proteins.faa
+//! ```
+
+use std::fs::File;
+use std::io::{self, BufReader};
+use std::marker::PhantomData;
+use std::path::PathBuf;
+
+use bio_seq::prelude::*;
+use noodles::fasta::Reader;
+
+use clap::Parser;
+
+#[derive(Parser)]
+#[command(
+    name = "aminokmers",
+    about = "Example of using noodles with bio-seq to count amino acid kmers"
+)]
+struct Cli {
+    #[arg(help = "Path to amino acid fasta file")]
+    fasta: PathBuf,
+}
+
+/// Construct a histogram on the heap that's indexed directly with Kmers.
+struct Histogram<C: Codec, const K: usize> {
+    counts: Vec<usize>,
+    _p: PhantomData<C>,
+}
+
+impl<C: Codec, const K: usize> Histogram<C, K> {
+    fn new() -> Self {
+        Self {
+            counts: vec![0; 1 << (K * C::BITS as usize)],
+            _p: PhantomData,
+        }
+    }
+
+    fn add(&mut self, kmer: Kmer<C, K>) {
+        self.counts[usize::from(&kmer)] += 1;
+    }
+
+    /*
+        fn get(&self, kmer: Kmer<C, K>) -> usize {
+            self.counts[usize::from(&kmer)]
+        }
+    */
+
+    fn items(&self) -> impl Iterator<Item = (Kmer<C, K>, usize)> + '_ {
+        self.counts
+            .iter()
+            .enumerate()
+            .filter(|(_, &count)| count > 0)
+            .map(|(i, &count)| (Kmer::<C, K>::from(i), count))
+    }
+}
+
+fn main() -> io::Result<()> {
+    let args = Cli::parse();
+    let fasta = File::open(&args.fasta)?;
+
+    let mut reader = Reader::new(BufReader::new(fasta));
+    let mut histogram = Histogram::<Amino, 3>::new();
+
+    for result in reader.records() {
+        let record = result?;
+        let seq: Seq<Amino> = record
+            .sequence()
+            .as_ref()
+            .try_into()
+            .expect("could not parse fasta");
+
+        for kmer in seq.kmers() {
+            histogram.add(kmer);
+        }
+    }
+
+    for (kmer, count) in histogram.items() {
+        println!("{kmer}: {count}");
+    }
+
+    Ok(())
+}
diff --git a/bio-seq/src/codec/dna.rs b/bio-seq/src/codec/dna.rs
@@ -14,13 +14,15 @@ pub enum Dna {
 impl Codec for Dna {
     const BITS: u8 = 2;
 
-    /// Take the two least significant bits of a `u8` and map them to the
-    /// corresponding nucleotides.
+    /// Transmute a `u8` into a nucleotide
+    ///
+    /// SAFETY: This only looks at the lower 2 bits of the `u8`
     fn unsafe_from_bits(b: u8) -> Self {
+        debug_assert!(b < 4);
         unsafe { std::mem::transmute(b & 0b11) }
     }
 
-    /// We can efficient verify that a byte is a valid `Dna` value if it's
+    /// We can verify that a byte is a valid `Dna` value if it's
     /// between 0 and 3.
     fn try_from_bits(b: u8) -> Option<Self> {
         if b < 4 {