diff --git a/bio-seq/Cargo.toml b/bio-seq/Cargo.toml
index 1066b32..dd28da7 100644
--- a/bio-seq/Cargo.toml
+++ b/bio-seq/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "bio-seq"
-version = "0.13.0"
+version = "0.13.1"
 authors = ["jeff-k "]
 edition = "2021"
 description = "Bit packed and well-typed biological sequences"
diff --git a/bio-seq/src/codec/masked.rs b/bio-seq/src/codec/masked.rs
new file mode 100644
index 0000000..b23a800
--- /dev/null
+++ b/bio-seq/src/codec/masked.rs
@@ -0,0 +1,114 @@
+//! Experimental 4-bit DNA encoding with masked bases
+//! Includes `N`, `n`, `.`, `-`
+//!
+//! The bit patterns are chosen so that whole-sequence operations reduce to
+//! plain bit manipulation:
+//!
+//! * inverting the four bits of a base toggles its mask (`A` <-> `a`,
+//!   `N` <-> `n`);
+//! * reversing the four bits of a base yields its complement (`A` <-> `T`,
+//!   `C` <-> `G`, `a` <-> `t`, `c` <-> `g`), so no separate per-base
+//!   complement table is needed.
+
+use crate::codec::Codec;
+use crate::seq::{ReverseComplement, Seq};
+
+/// A base, soft-masked base, gap or padding symbol encoded in four bits.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Codec)]
+#[bits(4)]
+#[repr(u8)]
+pub enum Dna {
+    A = 0b1000,
+    C = 0b0100,
+    G = 0b0010,
+    T = 0b0001,
+
+    // Each masked base is the bitwise inverse of its unmasked counterpart.
+    #[display('a')]
+    AMasked = 0b0111,
+    #[display('c')]
+    CMasked = 0b1011,
+    #[display('g')]
+    GMasked = 0b1101,
+    #[display('t')]
+    TMasked = 0b1110,
+
+    N = 0b0000,
+    #[display('n')]
+    NMasked = 0b1111,
+
+    // The `alt` pattern of `Gap` and `Pad` is what both inverting and
+    // bit-reversing the primary pattern produce.
+    #[display('-')]
+    #[alt(0b0011)]
+    Gap = 0b1100,
+
+    #[display('.')]
+    #[alt(0b0101)]
+    Pad = 0b1010,
+
+    // `Unknown1` and `Unknown2` are each other's bitwise inverse.
+    #[display('?')]
+    Unknown1 = 0b0110,
+
+    #[display('!')]
+    Unknown2 = 0b1001,
+}
+
+impl Dna {
+    /// Toggle the mask of a single base.
+    ///
+    /// XOR with `0b1111` inverts all four bits, mapping a symbol to its
+    /// masked or unmasked counterpart (`A` <-> `a`, `N` <-> `n`).
+    pub fn mask(&self) -> Self {
+        let flipped = *self as u8 ^ 0b1111;
+        Dna::unsafe_from_bits(flipped)
+    }
+}
+
+impl Seq<Dna> {
+    /// Return a copy of this sequence with the mask of every base toggled.
+    ///
+    /// Inverts every bit of a clone of the backing bit vector; `self` is
+    /// left untouched.
+    pub fn mask(&self) -> Self {
+        Self::from(!self.bv.clone())
+    }
+}
+
+impl ReverseComplement for Seq<Dna> {
+    type Output = Self;
+
+    /// Reverse complement computed by reversing a clone of the backing bit
+    /// vector.
+    ///
+    /// Reversing the bits reverses the base order and bit-reverses each
+    /// base, which for this encoding is its complement.
+    fn revcomp(&self) -> Self {
+        let mut bv = self.bv.clone();
+        bv.reverse();
+        Self::from(bv)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::codec::masked;
+    use crate::prelude::*;
+
+    #[test]
+    fn mask_sequence() {
+        let seq = Seq::<masked::Dna>::try_from("A.TCGCgtcataN--A").unwrap();
+
+        assert_ne!(seq.mask().to_string(), "a.tcgcGTGATAN--a".to_string());
+        assert_eq!(seq.mask().to_string(), "a.tcgcGTCATAn--a".to_string());
+    }
+
+    #[test]
+    fn masked_revcomp() {
+        let seq = Seq::<masked::Dna>::try_from("A.TCGCgtcataN--A").unwrap();
+
+        assert_ne!(seq.revcomp().to_string(), "T--NtaagacGCGA.T".to_string());
+        assert_eq!(seq.revcomp().to_string(), "T--NtatgacGCGA.T".to_string());
+    }
+}
diff --git a/bio-seq/src/codec/mod.rs b/bio-seq/src/codec/mod.rs
index 574447c..a594364 100644
--- a/bio-seq/src/codec/mod.rs
+++ b/bio-seq/src/codec/mod.rs
@@ -119,6 +119,7 @@ use core::hash::Hash;
 pub mod amino;
 pub mod dna;
 pub mod iupac;
+pub mod masked;
 pub mod text;
 
 
diff --git a/bio-seq/src/seq/mod.rs b/bio-seq/src/seq/mod.rs
index 20f20ee..e5ea2a3 100644
--- a/bio-seq/src/seq/mod.rs
+++ b/bio-seq/src/seq/mod.rs
@@ -466,6 +466,19 @@ impl From> for Seq {
     }
 }
 
+/// Wrap an already-encoded bit vector as a sequence.
+///
+/// The bits are taken as-is; nothing here checks that `bv` holds a whole
+/// number of valid symbols for the target codec.
+impl<A: Codec> From<Bv> for Seq<A> {
+    fn from(bv: Bv) -> Self {
+        Seq {
+            _p: PhantomData,
+            bv,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::prelude::*;