From 8a61472903346fefb5ebb7e532d788936829682f Mon Sep 17 00:00:00 2001 From: jeff-k Date: Wed, 9 Oct 2024 22:25:51 +0100 Subject: [PATCH] KmerStorage definition and rotation operations --- bio-seq/src/kmer/mod.rs | 423 ++++++++++++++++++++++++++++------------ 1 file changed, 303 insertions(+), 120 deletions(-) diff --git a/bio-seq/src/kmer/mod.rs b/bio-seq/src/kmer/mod.rs index 1d07d4f..76b920d 100644 --- a/bio-seq/src/kmer/mod.rs +++ b/bio-seq/src/kmer/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2021, 2022, 2023 Jeff Knaggs +// Copyright 2021-2024 Jeff Knaggs // Licensed under the MIT license (http://opensource.org/licenses/MIT) // This file may not be copied, modified, or distributed // except according to those terms. @@ -26,10 +26,14 @@ //! # use bio_seq::prelude::*; //! let kmer: Kmer = dna!("AGTTGGCA").into(); //! ``` + +// permit truncations that may happen on 32-bit platforms (unsupported) +#![allow(clippy::cast_possible_truncation)] + use crate::codec::Codec; use crate::prelude::{Complement, ParseBioError, ReverseComplement}; use crate::seq::{Seq, SeqArray, SeqSlice}; -use crate::{Ba, Bs, Bv}; +use crate::{Ba, Bs}; use bitvec::field::BitField; use bitvec::view::BitView; use core::fmt; @@ -42,82 +46,206 @@ use core::str::FromStr; #[cfg(feature = "simd")] pub mod simd; -//use bitvec::prelude::*; - #[cfg(feature = "serde")] use serde_derive::{Deserialize, Serialize}; -// TODO -pub trait KmerStorage { - const BITS: usize; - fn new() -> Self; +mod sealed { + use crate::Bs; + + pub trait KmerStorage: Copy + Clone { + const BITS: usize; + type BaN: AsRef + AsMut; + + fn to_bitarray(self) -> Self::BaN; + fn from_bitslice(bs: &Bs) -> Self; + + // fn rotate_left(self, n: u32) -> Self; + // fn rotate_right(self, n: u32) -> Self; + + fn mask(&mut self, bits: usize); + } +} + +pub trait KmerStorage: sealed::KmerStorage {} + +impl sealed::KmerStorage for u32 { + const BITS: usize = u32::BITS as usize; + type BaN = Ba<1>; + + fn to_bitarray(self) -> Ba<1> { + Self::BaN::new([self as usize]) + } + + fn from_bitslice(bs: &Bs) -> Self { + debug_assert!(bs.len() >= 32, "Target SeqSlice data less than 32 bits"); + bs[..32].load_le() + } + + fn mask(&mut self, bits: usize) { + *self &= (1 << bits) - 1; + } } -// TODO -impl KmerStorage for usize { +impl KmerStorage for u32 {} + +impl sealed::KmerStorage for usize { const BITS: usize = usize::BITS as usize; + type BaN = Ba<1>; + + fn to_bitarray(self) -> Ba<1> { + Self::BaN::new([self]) + } - fn new() -> Self { - 0 + fn from_bitslice(bs: &Bs) -> Self { + bs.load_le() + } + + /* + fn rotate_right(self, n: u32) -> Self { + self.rotate_right(n) + } + fn rotate_left(self, n: u32) -> Self { + self.rotate_left(n) + } + */ + + fn mask(&mut self, bits: usize) { + *self &= (1 << bits) - 1; } } -/// By default k-mers are backed by `usize`, `Codec::BITS` * K must be <= 64 +impl KmerStorage for usize {} + +impl sealed::KmerStorage for u64 { + const BITS: usize = u64::BITS as usize; + type BaN = Ba<1>; + + fn to_bitarray(self) -> Ba<1> { + // depends on global assertion that usize::BITS == 64 + Self::BaN::new([self.try_into().unwrap()]) + } + + fn from_bitslice(bs: &Bs) -> Self { + bs.load_le::() + } + + fn mask(&mut self, bits: usize) { + *self &= (1 << bits) - 1; + } +} + +impl KmerStorage for u64 {} + +impl sealed::KmerStorage for u128 { + const BITS: usize = u128::BITS as usize; + type BaN = Ba<2>; + + fn to_bitarray(self) -> Ba<2> { + Self::BaN::new([self as usize, (self >> 64) as usize]) + } + + fn from_bitslice(bs: &Bs) -> Self { + bs.load_le::() + } + + fn mask(&mut self, bits: usize) { + *self &= (1 << bits) - 1; + } +} + +impl KmerStorage for u128 {} + +/// By default k-mers are backed by `usize` and `Codec::BITS` * `K` must be <= 64 on 64-bit platforms #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[repr(transparent)] pub struct Kmer { - #[cfg_attr(feature = "serde", serde(skip))] pub _p: PhantomData, pub bs: S, } -impl Kmer { - /// Push a base from the right: - /// - /// ``` - /// use bio_seq::prelude::*; - /// use bio_seq::codec::dna::Dna; - /// - /// let k = kmer!("ACGAT"); - /// assert_eq!(k.pushr(Dna::T).to_string(), "CGATT"); - /// ``` - pub fn pushr(self, base: A) -> Kmer { - let bs = &Ba::from(base.to_bits() as usize)[..A::BITS as usize]; - let ba = &Ba::from(self.bs); - - let mut x: Bv = Bv::new(); - x.extend_from_bitslice(&ba[A::BITS as usize..A::BITS as usize * K]); - x.extend_from_bitslice(bs); +impl Kmer { + // This error message can be formatted with constants in nightly (const_format) + const _ASSERT_K: () = assert!( + K * A::BITS as usize <= S::BITS, + "`KmerStorage` not large enough for `Kmer`", + ); + + const BITS: usize = K * A::BITS as usize; + + pub fn rotated_left(&self, n: u32) -> Self { + let n: usize = (n as usize % K) * A::BITS as usize; + let mut ba = self.bs.to_bitarray(); + let bs: &mut Bs = ba.as_mut(); + bs[..Self::BITS].rotate_left(n); + Kmer { _p: PhantomData, - bs: x.load_le::(), + bs: S::from_bitslice(&bs[..Self::BITS]), } } - /// Push a base from the left - pub fn pushl(self, base: A) -> Kmer { - let bs = &Ba::from(base.to_bits() as usize)[..A::BITS as usize]; - let ba = &Ba::from(self.bs); + pub fn rotated_right(&self, n: u32) -> Self { + let n: usize = (n as usize % K) * A::BITS as usize; + let mut ba = self.bs.to_bitarray(); + let bs: &mut Bs = ba.as_mut(); + bs[..Self::BITS].rotate_right(n); - let mut x: Bv = Bv::new(); - x.extend_from_bitslice(bs); - x.extend_from_bitslice(&ba[..A::BITS as usize * K - A::BITS as usize]); Kmer { _p: PhantomData, - bs: x.load_le::(), + bs: S::from_bitslice(&bs[..Self::BITS]), } } - /// Iterate through all bases of a Kmer - pub fn iter(self) -> KmerBases { - KmerBases { - _p: PhantomData, - bits: Ba::from(self.bs), - index: 0, + /* + /// Push a base from the right: + /// + /// ``` + /// use bio_seq::prelude::*; + /// use bio_seq::codec::dna::Dna; + /// + /// let k = kmer!("ACGAT"); + /// assert_eq!(k.pushr(Dna::T).to_string(), "CGATT"); + /// ``` + pub fn pushr(self, base: A) -> Kmer { + let bs: usize = base.to_bits().into(); + let ba = S::to_bitarray(self.bs); + + let mut x: Bv = Bv::new(); + x.extend_from_bitslice(&ba[A::BITS as usize..A::BITS as usize * K]); + x.extend_from_bitslice(&bs); + Kmer { + _p: PhantomData, + bs: S::from_bitslice(&x), + } + } + */ + /* + /// Push a base from the left + pub fn pushl(self, base: A) -> Kmer { + let bs = &Ba::from(base.to_bits() as usize)[..A::BITS as usize]; + let ba = &Ba::from(self.bs); + + let mut x: Bv = Bv::new(); + x.extend_from_bitslice(bs); + x.extend_from_bitslice(&ba[..A::BITS as usize * K - A::BITS as usize]); + Kmer { + _p: PhantomData, + bs: x.load_le::(), + } } - } + */ + /* + /// Iterate through all bases of a Kmer + pub fn iter(self) -> KmerBases { + KmerBases { + _p: PhantomData, + bits: Ba::from(self.bs), + index: 0, + } + } + */ /* /// tail pub fn tail(self, base: A) -> Seq { @@ -131,14 +259,8 @@ impl Kmer { */ } -impl From for Kmer { +impl From for Kmer { fn from(i: usize) -> Kmer { - const { - assert!( - K <= usize::BITS as usize / A::BITS as usize, - "K is too large: it should be <= usize::BITS / A::BITS" - ); - }; Kmer { _p: PhantomData, bs: i, @@ -146,17 +268,17 @@ impl From for Kmer { } } -impl From<&SeqArray> for Kmer { - fn from(seq: &SeqArray) -> Kmer { +impl From<&SeqArray> for Kmer { + fn from(seq: &SeqArray) -> Self { Kmer { _p: PhantomData, - bs: seq.into(), + bs: S::from_bitslice(&seq.bs), } } } -impl From<&Kmer> for usize { - fn from(kmer: &Kmer) -> usize { +impl From<&Kmer> for usize { + fn from(kmer: &Kmer) -> usize { kmer.bs } } @@ -188,22 +310,15 @@ impl AsRef> for Kmer { } } -/* -impl From> for usize { - fn from(kmer: Kmer) -> usize { - kmer.bs - } -} -*/ - -impl fmt::Display for Kmer { +impl fmt::Display for Kmer { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut s = String::new(); - Ba::from(self.bs)[..K * A::BITS as usize] - .chunks(A::BITS as usize) - .for_each(|chunk| { - s.push(A::unsafe_from_bits(chunk.load_le::()).to_char()); - }); + let ba = self.bs.to_bitarray(); + let bs: &Bs = &ba.as_ref()[0..(K * A::BITS as usize)]; + + bs.chunks(A::BITS as usize).for_each(|chunk| { + s.push(A::unsafe_from_bits(chunk.load_le::()).to_char()); + }); write!(f, "{s}") } } @@ -218,12 +333,6 @@ pub struct KmerIter<'a, A: Codec, const K: usize> { impl Kmer { fn unsafe_from(slice: &SeqSlice) -> Self { - const { - assert!( - K <= usize::BITS as usize / A::BITS as usize, - "K is too large: it should be <= usize::BITS / A::BITS" - ); - }; Kmer { _p: PhantomData, bs: slice.try_into().unwrap(), @@ -231,7 +340,7 @@ impl Kmer { } } -impl<'a, A: Codec, const K: usize> Iterator for KmerIter<'a, A, K> { +impl Iterator for KmerIter<'_, A, K> { type Item = Kmer; fn next(&mut self) -> Option> { let i = self.index; @@ -243,10 +352,11 @@ impl<'a, A: Codec, const K: usize> Iterator for KmerIter<'a, A, K> { } } +/* /// An iterator over the bases of a kmer -pub struct KmerBases { +pub struct KmerBases { pub _p: PhantomData, - pub bits: Ba, + pub bits: S::BaN, pub index: usize, } @@ -264,6 +374,7 @@ impl Iterator for KmerBases { Some(A::unsafe_from_bits(chunk.load_le::())) } } +*/ /// ``` /// use bio_seq::prelude::*; @@ -291,12 +402,6 @@ impl TryFrom<&SeqSlice> for Kmer { type Error = ParseBioError; fn try_from(seq: &SeqSlice) -> Result { - const { - assert!( - K <= usize::BITS as usize / A::BITS as usize, - "K is too large: it should be <= usize::BITS / A::BITS" - ); - }; if seq.len() == K { Ok(Kmer::::unsafe_from(&seq[0..K])) } else { @@ -404,39 +509,39 @@ mod tests { assert_eq!(index as usize, (&kmer).into()); } } + /* + #[test] + fn pushl_test() { + let k = kmer!("ACGT"); + let k1 = k.pushl(Dna::G); + let k2 = k1.pushl(Dna::A); + let k3 = k2.pushl(Dna::T); + let k4 = k3.pushl(Dna::C); + let k5 = k4.pushl(Dna::C); + + assert_eq!(k1, kmer!("GACG")); + assert_eq!(k2, kmer!("AGAC")); + assert_eq!(k3, kmer!("TAGA")); + assert_eq!(k4, kmer!("CTAG")); + assert_eq!(k5, kmer!("CCTA")); + } - #[test] - fn pushl_test() { - let k = kmer!("ACGT"); - let k1 = k.pushl(Dna::G); - let k2 = k1.pushl(Dna::A); - let k3 = k2.pushl(Dna::T); - let k4 = k3.pushl(Dna::C); - let k5 = k4.pushl(Dna::C); - - assert_eq!(k1, kmer!("GACG")); - assert_eq!(k2, kmer!("AGAC")); - assert_eq!(k3, kmer!("TAGA")); - assert_eq!(k4, kmer!("CTAG")); - assert_eq!(k5, kmer!("CCTA")); - } - - #[test] - fn pushr_test() { - let k = kmer!("ACGT"); - let k1 = k.pushr(Dna::G); - let k2 = k1.pushr(Dna::A); - let k3 = k2.pushr(Dna::T); - let k4 = k3.pushr(Dna::C); - let k5 = k4.pushr(Dna::C); - - assert_eq!(k1, kmer!("CGTG")); - assert_eq!(k2, kmer!("GTGA")); - assert_eq!(k3, kmer!("TGAT")); - assert_eq!(k4, kmer!("GATC")); - assert_eq!(k5, kmer!("ATCC")); - } - + #[test] + fn pushr_test() { + let k = kmer!("ACGT"); + let k1 = k.pushr(Dna::G); + let k2 = k1.pushr(Dna::A); + let k3 = k2.pushr(Dna::T); + let k4 = k3.pushr(Dna::C); + let k5 = k4.pushr(Dna::C); + + assert_eq!(k1, kmer!("CGTG")); + assert_eq!(k2, kmer!("GTGA")); + assert_eq!(k3, kmer!("TGAT")); + assert_eq!(k4, kmer!("GATC")); + assert_eq!(k5, kmer!("ATCC")); + } + */ #[test] fn amino_kmer_to_usize() { for (kmer, index) in Seq::::try_from("SRY") @@ -447,6 +552,7 @@ mod tests { assert_eq!(index as usize, usize::from(&kmer)); } } + /* #[test] fn big_kmer_shiftr() { let mut kmer: Kmer = kmer!("AATTTGTGGGTTCGTCTGCGGCTCCGCCCTTA"); @@ -464,6 +570,7 @@ mod tests { } assert_eq!(kmer!("AAACAAGAATACCACGACTAGCAGGAGTATCA"), kmer); } + */ #[test] fn amino_kmer_iter() { for (kmer, target) in Seq::::try_from("SSLMNHKKL") @@ -475,6 +582,82 @@ mod tests { } } + #[test] + fn test_rotations() { + let kmer = Kmer::from(dna!("ACTGCGATG")); + + for (shift, rotation) in vec![ + "ACTGCGATG", + "CTGCGATGA", + "TGCGATGAC", + "GCGATGACT", + "CGATGACTG", + "GATGACTGC", + "ATGACTGCG", + "TGACTGCGA", + "GACTGCGAT", + "ACTGCGATG", + "CTGCGATGA", + "TGCGATGAC", + ] + .into_iter() + .enumerate() + { + // println!("{} {} {}", shift, kmer.rotated_left(shift as u32), rotation); + assert_eq!(kmer.rotated_left(shift as u32), rotation); + } + + for (shift, rotation) in vec![ + "ACTGCGATG", + "GACTGCGAT", + "TGACTGCGA", + "ATGACTGCG", + "GATGACTGC", + "CGATGACTG", + "GCGATGACT", + "TGCGATGAC", + "CTGCGATGA", + "ACTGCGATG", + "GACTGCGAT", + ] + .into_iter() + .enumerate() + { + // println!("{} {} {}", shift, kmer.rotated_right(shift as u32), rotation); + assert_eq!(kmer.rotated_right(shift as u32), rotation); + } + + let kmer: Kmer = Kmer::from(dna!("ACTGCGAT")).rotated_left(1); + + assert_ne!(kmer.to_string(), "ACTGCGAT"); + assert_eq!(kmer.to_string(), "CTGCGATA"); + + let kmer: Kmer = Kmer::from(dna!("ACTGCGAT")).rotated_right(1); + + assert_ne!(kmer.to_string(), "ACTGCGAT"); + assert_eq!(kmer.to_string(), "TACTGCGA"); + + let kmer: Kmer = Kmer::from(dna!("ACTGCGATG")).rotated_left(0); + + assert_eq!(kmer.to_string(), "ACTGCGATG"); + assert_ne!(kmer.to_string(), "ACTGCGATGA"); + + let kmer: Kmer = Kmer::from(dna!("ACTGCGATG")).rotated_right(0); + + assert_eq!(kmer.to_string(), "ACTGCGATG"); + assert_ne!(kmer.to_string(), "ACTGCGATGA"); + + let kmer: Kmer = Kmer::from(dna!("ACTGCGATG")).rotated_left(9 * 3307); + + assert_eq!(kmer.to_string(), "ACTGCGATG"); + assert_ne!(kmer.to_string(), "ACTGCGATGA"); + + let kmer: Kmer = Kmer::from(dna!("ACTGCGATG")).rotated_right(9 * 3307); + + assert_eq!(kmer.to_string(), "ACTGCGATG"); + assert_ne!(kmer.to_string(), "ACTGCGATGA"); + } + /* #[test] fn eq_functions() {