Skip to content

Commit

Permalink
document new Seq features
Browse files Browse the repository at this point in the history
  • Loading branch information
jeff-k committed Oct 14, 2024
1 parent 11e08fc commit 8cdf277
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 35 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ Strings of encoded symbols are packed into [`Seq`](https://docs.rs/bio-seq/lates

kmers are short sequences of length `k` that generally fit into a register (e.g. `usize`, or SIMD vector) and implement `Copy`. `k` is a compile-time constant.

All data is stored little-endian. This affects the order that sequences map to the integers ("colexicographic" order).
All data is stored little-endian. This affects the order that sequences map to the integers:

```rust
for i in 0..=15 {
Expand Down Expand Up @@ -156,7 +156,7 @@ assert_eq!(minimiser, Kmer::from(dna!("GTAAAAAA")));
`Hash` is implemented for sequence and kmer types so equal values of these types will hash identically:

```rust
let seq_arr: &SeqArray<Dna, 32, 1> = dna!("AGCGCTAGTCGTACTGCCGCATCGCTAGCGCT");
let seq_arr: &'static SeqSlice<Dna> = dna!("AGCGCTAGTCGTACTGCCGCATCGCTAGCGCT");
let seq: Seq<Dna> = seq_arr.into();
let seq_slice: &SeqSlice<Dna> = &seq;
let kmer: Kmer<Dna, 32> = seq_arr.into();
Expand Down
7 changes: 7 additions & 0 deletions bio-seq/src/seq/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ use core::ptr;
use core::ops::{BitAnd, BitOr};
use std::hash::{Hash, Hasher};

/// Static bit-packed sequence meant to be accessed as a `&'static SeqSlice`
///
/// ```
/// # use bio_seq::prelude::*;
/// // internally, `dna!` constructs a `SeqArray<Dna>`
/// let seq: &'static SeqSlice<Dna> = dna!("ACGTGT");
/// ```
#[derive(Debug)]
#[repr(transparent)]
pub struct SeqArray<A: Codec, const N: usize, const W: usize> {
Expand Down
106 changes: 73 additions & 33 deletions bio-seq/src/seq/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,9 @@
// This file may not be copied, modified, or distributed
// except according to those terms.

//! Arbitrary length sequences of bit-packed genomic data, stored on the heap
//! Arbitrary length sequences of bit-packed genomic data
//!
//! `Seq` and `SeqSlice` are analogous to `String` and `str`. A `Seq` owns its data and a `SeqSlice` is a read-only window into a `Seq`.
//!
//! ```
//! use std::collections::HashMap;
//! use bio_seq::prelude::*;
//!
//! let reference = dna!("ACGTTCGCATGCTACGACGATC");
//!
//! let mut table: HashMap<Seq<Dna>, usize> = HashMap::new();
//!
//! // Associate some kind of count with sequences as keys:
//! table.insert(dna!("ACGTT").into(), 1);
//! table.insert(dna!("ACACCCCC").into(), 0);
//!
//! // The query is a short window in the reference `Seq`
//! let query: &SeqSlice<Dna> = &reference[..5];
//!
//! if let Some(value) = table.get(query) {
//! // `SeqSlice` implements `Display`
//! println!("{query}: {value}");
//! }
//! ```
//! `Seq` and `&SeqSlice` are analogous to `String` and `&str`. A `Seq` owns its data and a `SeqSlice` is a read-only window into a `Seq`.
pub mod index;
pub mod iterators;

Expand Down Expand Up @@ -181,30 +160,66 @@ impl<A: Codec> Seq<A> {
self.bv.clear();
}

/// Shorten the sequence, keeping only its first `len` symbols
/// ```
/// # use bio_seq::prelude::*;
/// let mut seq: Seq<Dna> = dna!("CCCCC").into();
/// seq.truncate(2);
/// assert_eq!(&seq, dna!("CC"));
/// ```
pub fn truncate(&mut self, len: usize) {
    // The backing bit vector is indexed in bits, so scale the symbol count
    // by the codec's bit width before truncating.
    let bit_len = len * A::BITS as usize;
    self.bv.truncate(bit_len);
}

/// Put the symbols of `other` in front of this sequence
/// ```
/// # use bio_seq::prelude::*;
/// let mut seq: Seq<Dna> = dna!("CCCCC").into();
/// seq.prepend(dna!("TTTT"));
/// assert_eq!(&seq, dna!("TTTTCCCCC"));
/// ```
pub fn prepend(&mut self, other: &SeqSlice<A>) {
    // Build a fresh bit vector sized for both parts: `other` first,
    // then the current contents, and swap it in.
    let total_bits = self.bv.len() + other.bs.len();
    let mut joined = Bv::with_capacity(total_bits);
    joined.extend_from_bitslice(&other.bs);
    joined.extend_from_bitslice(&self.bv);
    self.bv = joined;
}

/// Add the symbols of `other` to the end of this sequence
/// ```
/// # use bio_seq::prelude::*;
/// let mut seq: Seq<Dna> = dna!("CCCCC").into();
/// seq.append(dna!("TTTT"));
/// assert_eq!(&seq, dna!("CCCCCTTTT"));
/// ```
pub fn append(&mut self, other: &SeqSlice<A>) {
    let tail = &other.bs;
    self.bv.extend_from_bitslice(tail);
}

/// Replace the symbols in `range` with the contents of `other`
///
/// The replacement does not need to be the same length as the range it replaces.
/// ```
/// # use bio_seq::prelude::*;
/// let mut seq: Seq<Dna> = dna!("AAAACCAAAA").into();
/// seq.splice(4..6, dna!("TTTT"));
/// assert_eq!(&seq, dna!("AAAATTTTAAAA"));
/// ```
pub fn splice<R: RangeBounds<usize>>(&mut self, range: R, other: &SeqSlice<A>) {
    // Translate the symbol range into bit offsets of the backing vector.
    let (start, end) = self.bit_range(range);
    let replacement = other.bs.iter().by_vals();
    self.bv.splice(start..end, replacement);
}

/// Insert a slice into a sequence
/// ```
/// # use bio_seq::prelude::*;
/// let mut seq: Seq<Dna> = dna!("AAAAA").into();
/// seq.insert(3, dna!("TTTT"));
/// assert_eq!(&seq, dna!("AAATTTTAA"));
/// ```
///
/// # Panics
/// Panics if index out of bounds
pub fn insert(&mut self, index: usize, other: &SeqSlice<A>) {
if index > self.len() {
panic!("Index out of bounds");
}
assert!(index <= self.len(), "Index out of bounds");

let i = index * A::BITS as usize;
let mut bv = Bv::with_capacity(self.bv.len() + other.bs.len());
Expand All @@ -216,6 +231,13 @@ impl<A: Codec> Seq<A> {
self.bv = bv;
}

/// Remove a region of a sequence
/// ```
/// # use bio_seq::prelude::*;
/// let mut seq: Seq<Dna> = dna!("ACGTACGT").into();
/// seq.remove(2..5);
/// assert_eq!(&seq, dna!("ACCGT"));
/// ```
pub fn remove<R: RangeBounds<usize>>(&mut self, range: R) {
let (s, e) = self.bit_range(range);
self.bv.drain(s..e);
Expand All @@ -240,7 +262,14 @@ impl<A: Codec> Seq<A> {
}
}

/// **Experimental**
/// **Experimental** Access raw sequence data as `&[usize]`
///
/// Borrows the words backing the bit-packed sequence; nothing is copied.
/// The example below compares against a 64-bit literal, so it presumes a
/// 64-bit `usize`.
/// ```
/// # use bio_seq::prelude::*;
/// let seq: Seq<Dna> = dna!("TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCACGT").into();
/// let ints: Vec<usize> = seq.into_raw().iter().copied().collect();
/// assert_eq!(ints[0], 0b0101010101010101010101010101010111111111111111111111111111111111);
/// assert_eq!(ints[1], 0b11100100); // ACGT
/// ```
pub fn into_raw(&self) -> &[usize] {
self.bv.as_raw_slice()
}
Expand Down Expand Up @@ -273,15 +302,26 @@ impl<A: Codec> PartialEq<&Seq<A>> for Seq<A> {
/// Borrow a `Seq<A>` as a `SeqSlice<A>`.
///
/// The `Borrow` trait is used to obtain a reference to a `SeqSlice` from a `Seq`, allowing it to be used wherever a `SeqSlice` is expected.
///
/// ```
/// # use bio_seq::prelude::*;
/// use std::borrow::Borrow;
/// use std::collections::HashMap;
/// use bio_seq::prelude::*;
///
/// let seq: Seq<Dna> = dna!("CTACGTACGATCATCG").into();
/// let slice: &SeqSlice<Dna> = seq.borrow();
/// ```
/// let reference = dna!("ACGTTCGCATGCTACGACGATC");
///
/// let mut table: HashMap<Seq<Dna>, usize> = HashMap::new();
///
/// // Associate some kind of count with sequences as keys:
/// table.insert(dna!("ACGTT").into(), 1);
/// table.insert(dna!("ACACCCCC").into(), 0);
///
/// // The query is a short window in the reference `Seq`
/// let query: &SeqSlice<Dna> = &reference[..5];
///
/// if let Some(value) = table.get(query) {
/// // `SeqSlice` implements `Display`
/// println!("{query}: {value}");
/// }
/// ```
impl<A: Codec> Borrow<SeqSlice<A>> for Seq<A> {
fn borrow(&self) -> &SeqSlice<A> {
self.as_ref()
Expand Down

0 comments on commit 8cdf277

Please sign in to comment.