Skip to content

Commit

Permalink
initial test of masked sequence encoding (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeff-k committed Sep 22, 2024
1 parent 8debd14 commit 3e49f81
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 1 deletion.
2 changes: 1 addition & 1 deletion bio-seq/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bio-seq"
version = "0.13.0"
version = "0.13.1"
authors = ["jeff-k <[email protected]>"]
edition = "2021"
description = "Bit packed and well-typed biological sequences"
Expand Down
98 changes: 98 additions & 0 deletions bio-seq/src/codec/masked.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
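//! Experimental 4-bit DNA encoding with masked (lowercase) bases.
//!
//! Besides `A`, `C`, `G`, `T` and their masked forms, the alphabet includes
//! `N`, `n`, `.` (pad), and `-` (gap).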
use crate::codec::Codec;
use crate::seq::{ReverseComplement, Seq};

/// 4-bit DNA alphabet with masked (lowercase) symbols.
///
/// The discriminants are chosen so that two bit-level operations have a
/// biological meaning:
///
/// * the bitwise complement of a symbol's 4-bit pattern is its masked form
///   (`A = 0b1000`, `a = 0b0111`), and
/// * the bit-reversal of a base's pattern is its complementary base
///   (`A = 0b1000`, `T = 0b0001`; `C = 0b0100`, `G = 0b0010`).
///
/// Together the discriminants and `#[alt]` values below account for all
/// sixteen 4-bit patterns.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Codec)]
#[bits(4)]
#[repr(u8)]
pub enum Dna {
// Unmasked bases: exactly one bit set each.
A = 0b1000,
C = 0b0100,
G = 0b0010,
T = 0b0001,

// Masked bases: the bitwise complement of the corresponding unmasked base.
#[display('a')]
AMasked = 0b0111,
#[display('c')]
CMasked = 0b1011,
#[display('g')]
GMasked = 0b1101,
#[display('t')]
TMasked = 0b1110,

// `N` and its masked form are all-zeros / all-ones, so both are unchanged by
// bit-reversal and are each other's bitwise complement.
N = 0b0000,
#[display('n')]
NMasked = 0b1111,

// Gap: 0b0011 is both the bitwise complement and the bit-reversal of 0b1100.
// NOTE(review): `#[alt]` presumably registers 0b0011 as a second encoding of
// this variant, so masking/reversing a gap still decodes as a gap -- confirm
// against the `Codec` derive.
#[display('-')]
#[alt(0b0011)]
Gap = 0b1100,

// Pad: 0b0101 is both the bitwise complement and the bit-reversal of 0b1010
// (same `#[alt]` assumption as for `Gap`).
#[display('.')]
#[alt(0b0101)]
Pad = 0b1010,

// The two remaining patterns. Each equals its own bit-reversal, and they are
// each other's bitwise complement (0b0110 ^ 0b1111 == 0b1001).
#[display('?')]
Unknown1 = 0b0110,

#[display('!')]
Unknown2 = 0b1001,
}

/*
impl Complement for Dna {
/// This representation can be complemented by reversing the bit pattern
fn comp(&self) -> Self {
// reverse the bits
Dna::unsafe_from_bits(b)
}
}
*/

impl Dna {
/// Flipping the bit pattern masks/unmasks this representation
pub fn mask(&self) -> Self {
let b = *self as u8 ^ 0b1111;
Dna::unsafe_from_bits(b)
}
}

impl Seq<Dna> {
    /// Returns a copy of this sequence with the masking of every symbol toggled.
    ///
    /// The underlying bit vector is cloned and every bit is inverted; because a
    /// symbol's masked form is the bitwise complement of its unmasked form,
    /// this flips the whole sequence in one operation. `self` is left untouched.
    pub fn mask(&self) -> Self {
        let inverted = !self.bv.clone();
        Self::from(inverted)
    }
}

impl ReverseComplement for Seq<Dna> {
    type Output = Self;

    /// Returns the reverse complement as a new sequence.
    ///
    /// Reversing the whole bit vector reverses the order of the 4-bit symbols
    /// and, at the same time, reverses the bits inside each symbol. In this
    /// encoding the bit-reversal of a base is its complement (`A = 0b1000`
    /// becomes `T = 0b0001`, `C = 0b0100` becomes `G = 0b0010`), so a single
    /// reversal performs both steps.
    fn revcomp(&self) -> Self {
        let reversed = {
            let mut bits = self.bv.clone();
            bits.reverse();
            bits
        };
        Self::from(reversed)
    }
}

#[cfg(test)]
mod tests {
    use crate::codec::masked;
    use crate::prelude::*;

    /// Input shared by both tests: unmasked and masked bases, `N`, a pad
    /// (`.`) and gaps (`-`).
    const INPUT: &str = "A.TCGCgtcataN--A";

    /// Masking flips the case of every base and of `N`; pad and gap symbols
    /// are rendered unchanged.
    #[test]
    fn mask_sequence() {
        let seq = Seq::<masked::Dna>::try_from(INPUT).unwrap();
        let rendered = seq.mask().to_string();

        // Negative control: a string with wrong bases must not match.
        assert_ne!(rendered, "a.tcgcGTGATAN--a");
        assert_eq!(rendered, "a.tcgcGTCATAn--a");
    }

    /// Reverse-complementing keeps each symbol's masking while reversing the
    /// order and complementing the bases.
    #[test]
    fn masked_revcomp() {
        let seq = Seq::<masked::Dna>::try_from(INPUT).unwrap();
        let rendered = seq.revcomp().to_string();

        // Negative control: a near-miss string must not match.
        assert_ne!(rendered, "T--NtaagacGCGA.T");
        assert_eq!(rendered, "T--NtatgacGCGA.T");
    }
}
1 change: 1 addition & 0 deletions bio-seq/src/codec/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ use core::hash::Hash;
pub mod amino;
pub mod dna;
pub mod iupac;
pub mod masked;

pub mod text;

Expand Down
9 changes: 9 additions & 0 deletions bio-seq/src/seq/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,15 @@ impl From<Vec<usize>> for Seq<text::Dna> {
}
}

impl<A: Codec> From<Bv> for Seq<A> {
    /// Wraps an existing bit vector as a sequence.
    ///
    /// The vector is moved into the new `Seq` as-is: no copy is made and no
    /// validation is performed here.
    /// NOTE(review): callers presumably must ensure the bits form valid
    /// encodings of `A` -- confirm whether this impl should stay public.
    fn from(bv: Bv) -> Self {
        Self {
            bv,
            _p: PhantomData,
        }
    }
}

#[cfg(test)]
mod tests {
use crate::prelude::*;
Expand Down

0 comments on commit 3e49f81

Please sign in to comment.