-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
recommend noodles and provide example
- Loading branch information
Showing
4 changed files
with
133 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
//! This example demonstrates using bio-seq with [noodles](https://crates.io/crates/noodles) to count amino acid k-mers. | ||
//! | ||
//! It encodes amino acids sequences in an efficient bit-packed representation that can be directly used to index an array of counts. | ||
//! | ||
//! Run with: | ||
//! ```sh | ||
//! cargo run --example aminokmers -- path/to/proteins.faa | ||
//! ``` | ||
use std::fs::File; | ||
use std::io::{self, BufReader}; | ||
use std::marker::PhantomData; | ||
use std::path::PathBuf; | ||
|
||
use bio_seq::prelude::*; | ||
use noodles::fasta::Reader; | ||
|
||
use clap::Parser; | ||
|
||
#[derive(Parser)] | ||
#[command( | ||
name = "aminokmers", | ||
about = "Example of using noodles with bio-seq to count amino acid kmers" | ||
)] | ||
struct Cli { | ||
#[arg(help = "Path to amino acid fasta file")] | ||
fasta: PathBuf, | ||
} | ||
|
||
/// Construct a histogram on the heap that's indexed directly with Kmers. | ||
struct Histogram<C: Codec, const K: usize> { | ||
counts: Vec<usize>, | ||
_p: PhantomData<C>, | ||
} | ||
|
||
impl<C: Codec, const K: usize> Histogram<C, K> { | ||
fn new() -> Self { | ||
Self { | ||
counts: vec![0; 1 << (K * C::BITS as usize)], | ||
_p: PhantomData, | ||
} | ||
} | ||
|
||
fn add(&mut self, kmer: Kmer<C, K>) { | ||
self.counts[usize::from(&kmer)] += 1; | ||
} | ||
|
||
/* | ||
fn get(&self, kmer: Kmer<C, K>) -> usize { | ||
self.counts[usize::from(&kmer)] | ||
} | ||
*/ | ||
|
||
fn items(&self) -> impl Iterator<Item = (Kmer<C, K>, usize)> + '_ { | ||
self.counts | ||
.iter() | ||
.enumerate() | ||
.filter(|(_, &count)| count > 0) | ||
.map(|(i, &count)| (Kmer::<C, K>::from(i), count)) | ||
} | ||
} | ||
|
||
fn main() -> io::Result<()> { | ||
let args = Cli::parse(); | ||
let fasta = File::open(&args.fasta)?; | ||
|
||
let mut reader = Reader::new(BufReader::new(fasta)); | ||
let mut histogram = Histogram::<Amino, 3>::new(); | ||
|
||
for result in reader.records() { | ||
let record = result?; | ||
let seq: Seq<Amino> = record | ||
.sequence() | ||
.as_ref() | ||
.try_into() | ||
.expect("could not parse fasta"); | ||
|
||
for kmer in seq.kmers() { | ||
histogram.add(kmer); | ||
} | ||
} | ||
|
||
for (kmer, count) in histogram.items() { | ||
println!("{kmer}: {count}"); | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters