From 8cdf27766609d6b92c97d13dc65fc5ee147c8b48 Mon Sep 17 00:00:00 2001 From: jeff-k Date: Mon, 14 Oct 2024 23:34:59 +0100 Subject: [PATCH] document new Seq features --- README.md | 4 +- bio-seq/src/seq/array.rs | 7 +++ bio-seq/src/seq/mod.rs | 106 +++++++++++++++++++++++++++------------ 3 files changed, 82 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 3661c98..32666c6 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ Strings of encoded symbols are packed into [`Seq`](https://docs.rs/bio-seq/lates kmers are short sequences of length `k` that generally fit into a register (e.g. `usize`, or SIMD vector) and implement `Copy`. `k` is a compile-time constant. -All data is stored little-endian. This effects the order that sequences map to the integers ("colexicographic" order). +All data is stored little-endian. This affects the order that sequences map to the integers: ```rust for i in 0..=15 { @@ -156,7 +156,7 @@ assert_eq!(minimiser, Kmer::from(dna!("GTAAAAAA"))); `Hash` is implemented for sequence and kmer types so equal values of these types will hash identically: ```rust -let seq_arr: &SeqArray = dna!("AGCGCTAGTCGTACTGCCGCATCGCTAGCGCT"); +let seq_arr: &'static SeqSlice = dna!("AGCGCTAGTCGTACTGCCGCATCGCTAGCGCT"); let seq: Seq = seq_arr.into(); let seq_slice: &SeqSlice = &seq; let kmer: Kmer = seq_arr.into(); diff --git a/bio-seq/src/seq/array.rs b/bio-seq/src/seq/array.rs index 942e9aa..b60a92e 100644 --- a/bio-seq/src/seq/array.rs +++ b/bio-seq/src/seq/array.rs @@ -20,6 +20,13 @@ use core::ptr; use core::ops::{BitAnd, BitOr}; use std::hash::{Hash, Hasher}; +/// Static bit-packed sequence meant to be accessed as a `&'static SeqSlice` +/// +/// ``` +/// # use bio_seq::prelude::*; +/// // internally, `dna!` constructs a `SeqArray` +/// let seq: &'static SeqSlice = dna!("ACGTGT"); +/// ``` #[derive(Debug)] #[repr(transparent)] pub struct SeqArray { diff --git a/bio-seq/src/seq/mod.rs b/bio-seq/src/seq/mod.rs index 87d6338..5209888 100644 
--- a/bio-seq/src/seq/mod.rs +++ b/bio-seq/src/seq/mod.rs @@ -3,30 +3,9 @@ // This file may not be copied, modified, or distributed // except according to those terms. -//! Arbitrary length sequences of bit-packed genomic data, stored on the heap +//! Arbitrary length sequences of bit-packed genomic data //! -//! `Seq` and `SeqSlice` are analogous to `String` and `str`. A `Seq` owns its data and a `SeqSlice` is a read-only window into a `Seq`. -//! -//! ``` -//! use std::collections::HashMap; -//! use bio_seq::prelude::*; -//! -//! let reference = dna!("ACGTTCGCATGCTACGACGATC"); -//! -//! let mut table: HashMap, usize> = HashMap::new(); -//! -//! // Associate some kind of count with sequences as keys: -//! table.insert(dna!("ACGTT").into(), 1); -//! table.insert(dna!("ACACCCCC").into(), 0); -//! -//! // The query is a short window in the reference `Seq` -//! let query: &SeqSlice = &reference[..5]; -//! -//! if let Some(value) = table.get(query) { -//! // `SeqSlice` implements `Display` -//! println!("{query}: {value}"); -//! } -//! ``` +//! `Seq` and `&SeqSlice` are analogous to `String` and `&str`. A `Seq` owns its data and a `SeqSlice` is a read-only window into a `Seq`. 
pub mod index; pub mod iterators; @@ -181,10 +160,24 @@ impl Seq { self.bv.clear(); } + /// Truncate a sequence + /// ``` + /// # use bio_seq::prelude::*; + /// let mut seq: Seq = dna!("CCCCC").into(); + /// seq.truncate(2); + /// assert_eq!(&seq, dna!("CC")); + /// ``` pub fn truncate(&mut self, len: usize) { self.bv.truncate(len * A::BITS as usize); } + /// Prepend a slice + /// ``` + /// # use bio_seq::prelude::*; + /// let mut seq: Seq = dna!("CCCCC").into(); + /// seq.prepend(dna!("TTTT")); + /// assert_eq!(&seq, dna!("TTTTCCCCC")); + /// ``` pub fn prepend(&mut self, other: &SeqSlice) { let mut bv = Bv::with_capacity(self.bv.len() + other.bs.len()); bv.extend_from_bitslice(&other.bs); @@ -192,19 +185,41 @@ impl Seq { self.bv = bv; } + /// Append a slice + /// ``` + /// # use bio_seq::prelude::*; + /// let mut seq: Seq = dna!("CCCCC").into(); + /// seq.append(dna!("TTTT")); + /// assert_eq!(&seq, dna!("CCCCCTTTT")); + /// ``` pub fn append(&mut self, other: &SeqSlice) { self.bv.extend_from_bitslice(&other.bs); } + /// Remove a range and replace it with a slice + /// ``` + /// # use bio_seq::prelude::*; + /// let mut seq: Seq = dna!("AAAACCAAAA").into(); + /// seq.splice(4..6, dna!("TTTT")); + /// assert_eq!(&seq, dna!("AAAATTTTAAAA")); + /// ``` pub fn splice>(&mut self, range: R, other: &SeqSlice) { let (s, e) = self.bit_range(range); self.bv.splice(s..e, other.bs.iter().by_vals()); } + /// Insert a slice into a sequence + /// ``` + /// # use bio_seq::prelude::*; + /// let mut seq: Seq = dna!("AAAAA").into(); + /// seq.insert(3, dna!("TTTT")); + /// assert_eq!(&seq, dna!("AAATTTTAA")); + /// ``` + /// + /// # Panics + /// Panics if index out of bounds pub fn insert(&mut self, index: usize, other: &SeqSlice) { - if index > self.len() { - panic!("Index out of bounds"); - } + assert!(index <= self.len(), "Index out of bounds"); let i = index * A::BITS as usize; let mut bv = Bv::with_capacity(self.bv.len() + other.bs.len()); @@ -216,6 +231,13 @@ impl Seq { 
self.bv = bv; } + /// Remove a region of a sequence + /// ``` + /// # use bio_seq::prelude::*; + /// let mut seq: Seq = dna!("ACGTACGT").into(); + /// seq.remove(2..5); + /// assert_eq!(&seq, dna!("ACCGT")); + /// ``` pub fn remove>(&mut self, range: R) { let (s, e) = self.bit_range(range); self.bv.drain(s..e); @@ -240,7 +262,14 @@ impl Seq { } } - /// **Experimental** + /// **Experimental** Access raw sequence data as `&[usize]` + /// ``` + /// # use bio_seq::prelude::*; + /// let seq: Seq = dna!("TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCACGT").into(); + /// let ints: Vec = seq.into_raw().iter().copied().collect(); + /// assert_eq!(ints[0], 0b0101010101010101010101010101010111111111111111111111111111111111); + /// assert_eq!(ints[1], 0b11100100); // ACGT + /// ``` pub fn into_raw(&self) -> &[usize] { self.bv.as_raw_slice() } @@ -273,15 +302,26 @@ impl PartialEq<&Seq> for Seq { /// Borrow a `Seq` as a `SeqSlice`. /// /// The `Borrow` trait to is used to obtain a reference to a `SeqSlice` from a `Seq`, allowing it to be used wherever a `SeqSlice` is expected. -/// /// ``` -/// # use bio_seq::prelude::*; -/// use std::borrow::Borrow; +/// use std::collections::HashMap; +/// use bio_seq::prelude::*; /// -/// let seq: Seq = dna!("CTACGTACGATCATCG").into(); -/// let slice: &SeqSlice = seq.borrow(); -/// ``` +/// let reference = dna!("ACGTTCGCATGCTACGACGATC"); +/// +/// let mut table: HashMap, usize> = HashMap::new(); /// +/// // Associate some kind of count with sequences as keys: +/// table.insert(dna!("ACGTT").into(), 1); +/// table.insert(dna!("ACACCCCC").into(), 0); +/// +/// // The query is a short window in the reference `Seq` +/// let query: &SeqSlice = &reference[..5]; +/// +/// if let Some(value) = table.get(query) { +/// // `SeqSlice` implements `Display` +/// println!("{query}: {value}"); +/// } +/// ``` impl Borrow> for Seq { fn borrow(&self) -> &SeqSlice { self.as_ref()