Skip to content

Commit

Permalink
recommend noodles and provide example
Browse files Browse the repository at this point in the history
  • Loading branch information
jeff-k committed Oct 23, 2024
1 parent 5bcde4f commit 88a2f00
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 16 deletions.
42 changes: 34 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,23 @@ let slice: &str = &s[1..3];
let seqslice: &SeqSlice<Dna> = &seq[2..4];
```

Sequences can be encoded from popular third-party crates like [noodles](https://crates.io/crates/noodles):

```
let mut reader = noodles::fasta::Reader::new(BufReader::new(fasta));
for result in reader.records() {
let record = result?;
let seq: Seq<Dna> = record
.sequence()
.as_ref()
.try_into()
.unwrap()
// ...
}
```

## Philosophy

Many bioinformatics crates implement their own kmer packing logic. This effort began as a way to define types and traits that allow kmer code to be shared between projects. It quickly became apparent that a kmer type doesn't make sense without being tightly coupled to a general type for sequences. The scope of this crate will be limited to operating on fixed and arbitrary length sequences with an emphasis on safety.
Expand Down Expand Up @@ -123,21 +140,30 @@ for i in 0..=15 {
A lookup table can be indexed in constant time by treating kmers directly as `usize`:

```rust
// This example builds a histogram of kmer occurences (on the stack!)
struct Histogram<C: Codec, const K: usize> {
counts: Vec<usize>,
_p: PhantomData<C>,
}

fn kmer_histogram<C: Codec, const K: usize>(seq: &SeqSlice<C>) -> Vec<usize> {
// For dna::Dna our histogram will need 4^4
// bins to count every possible 4-mer
let mut histo = vec![0; 1 << (C::WIDTH * K as u8)];
impl<C: Codec, const K: usize> Histogram<C, K> {
fn new() -> Self {
Self {
counts: vec![0; 1 << (K * C::BITS as usize)],
_p: PhantomData,
}
}

for kmer in seq.kmers::<K>() {
histo[usize::from(kmer)] += 1;
fn add(&mut self, kmer: Kmer<C, K>) {
self.counts[usize::from(&kmer)] += 1;
}

histo
fn get(&self, kmer: Kmer<C, K>) -> usize {
self.counts[usize::from(&kmer)]
}
}
```


## Sketching

### Minimising
Expand Down
11 changes: 6 additions & 5 deletions bio-seq/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ categories = ["science::bioinformatics", "science::bioinformatics::genomics", "s
readme = "../README.md"
rust-version = "1.79"

#[dev-dependencies]
#bio-streams = { git="https://github.com/jeff-k/bio-streams.git" }
#clap = { version="4", features=["derive"] }
[dev-dependencies]
clap = { version="4", features=["derive"] }
noodles = { version="0.84", features=["fasta"] }

[dependencies]
bitvec = "1"
Expand All @@ -28,8 +28,9 @@ translation = []
simd = []
extra_codecs = []

#[[example]]
#name = "aminokmers"
[[example]]
name = "aminokmers"
required-features = ["noodles/fasta"]

[package.metadata.docs.rs]
features = ["translation", "serde", "extra_codecs"]
Expand Down
88 changes: 88 additions & 0 deletions bio-seq/examples/aminokmers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
//! This example demonstrates using bio-seq with [noodles](https://crates.io/crates/noodles) to count amino acid k-mers.
//!
//! It encodes amino acids sequences in an efficient bit-packed representation that can be directly used to index an array of counts.
//!
//! Run with:
//! ```sh
//! cargo run --example aminokmers -- path/to/proteins.faa
//! ```
use std::fs::File;
use std::io::{self, BufReader};
use std::marker::PhantomData;
use std::path::PathBuf;

use bio_seq::prelude::*;
use noodles::fasta::Reader;

use clap::Parser;

#[derive(Parser)]
#[command(
name = "aminokmers",
about = "Example of using noodles with bio-seq to count amino acid kmers"
)]
struct Cli {
#[arg(help = "Path to amino acid fasta file")]
fasta: PathBuf,
}

/// Construct a histogram on the heap that's indexed directly with Kmers.
struct Histogram<C: Codec, const K: usize> {
counts: Vec<usize>,
_p: PhantomData<C>,
}

impl<C: Codec, const K: usize> Histogram<C, K> {
fn new() -> Self {
Self {
counts: vec![0; 1 << (K * C::BITS as usize)],
_p: PhantomData,
}
}

fn add(&mut self, kmer: Kmer<C, K>) {
self.counts[usize::from(&kmer)] += 1;
}

/*
fn get(&self, kmer: Kmer<C, K>) -> usize {
self.counts[usize::from(&kmer)]
}
*/

fn items(&self) -> impl Iterator<Item = (Kmer<C, K>, usize)> + '_ {
self.counts
.iter()
.enumerate()
.filter(|(_, &count)| count > 0)
.map(|(i, &count)| (Kmer::<C, K>::from(i), count))
}
}

fn main() -> io::Result<()> {
let args = Cli::parse();
let fasta = File::open(&args.fasta)?;

let mut reader = Reader::new(BufReader::new(fasta));
let mut histogram = Histogram::<Amino, 3>::new();

for result in reader.records() {
let record = result?;
let seq: Seq<Amino> = record
.sequence()
.as_ref()
.try_into()
.expect("could not parse fasta");

for kmer in seq.kmers() {
histogram.add(kmer);
}
}

for (kmer, count) in histogram.items() {
println!("{kmer}: {count}");
}

Ok(())
}
8 changes: 5 additions & 3 deletions bio-seq/src/codec/dna.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@ pub enum Dna {
impl Codec for Dna {
const BITS: u8 = 2;

/// Take the two least significant bits of a `u8` and map them to the
/// corresponding nucleotides.
/// Transmute a `u8` into a nucleotide
///
/// SAFETY: This only looks at the lower 2 bits of the `u8`
fn unsafe_from_bits(b: u8) -> Self {
debug_assert!(b < 4);
unsafe { std::mem::transmute(b & 0b11) }
}

/// We can efficient verify that a byte is a valid `Dna` value if it's
/// We can verify that a byte is a valid `Dna` value if it's
/// between 0 and 3.
fn try_from_bits(b: u8) -> Option<Self> {
if b < 4 {
Expand Down

0 comments on commit 88a2f00

Please sign in to comment.