Skip to content

Commit

Permalink
more useful 1-bit degenerate DNA encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
jeff-k committed Oct 23, 2024
1 parent 6b03505 commit b0e1043
Showing 1 changed file with 51 additions and 3 deletions.
54 changes: 51 additions & 3 deletions bio-seq/src/codec/degenerate/dna.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,63 @@ use crate::codec::Codec;
use crate::codec::Complement;
use crate::seq::{ReverseComplement, Seq};

/// 1-bit encoding for `S`trong (`G`/`C`) and `W`eak (`A`/`T`) nucleotides
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Codec)]
#[bits(1)]
/// Degenerate DNA alphabet packed into a single bit.
///
/// Nucleotides are grouped by binding strength: `W`eak covers `A`/`T` and
/// `S`trong covers `G`/`C`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u8)]
pub enum Dna {
    /// Weak binding nucleotide (`A` or `T`), stored as bit `0`.
    W = 0,
    /// Strong binding nucleotide (`G` or `C`), stored as bit `1`.
    S = 1,
}

impl Codec for Dna {
    /// Each symbol occupies a single bit.
    const BITS: u8 = 1;

    /// Convert a `u8` into a degenerate 1-bit nucleotide without validating it.
    ///
    /// Only the lowest bit of `b` is inspected: `0` yields `W` and `1` yields `S`.
    /// Debug builds assert that `b < 2`; release builds ignore the upper bits.
    fn unsafe_from_bits(b: u8) -> Self {
        debug_assert!(b < 2);
        // A plain branch on the masked bit replaces the former `transmute`;
        // no `unsafe` is needed to pick between two variants.
        if b & 0b1 == 0 {
            Dna::W
        } else {
            Dna::S
        }
    }

    /// Checked conversion from a bit pattern.
    ///
    /// Valid values are `0` (`W`) and `1` (`S`); anything else returns `None`.
    fn try_from_bits(b: u8) -> Option<Self> {
        match b {
            0 => Some(Dna::W),
            1 => Some(Dna::S),
            _ => None,
        }
    }

    /// Unchecked translation of an ASCII byte: `C`, `G` and `S` map to `S`,
    /// every other byte maps to `W`.
    ///
    /// Debug builds assert that `b` is one of the symbols accepted by
    /// `try_from_ascii` (`A`, `C`, `G`, `T`, `S`, `W`).
    fn unsafe_from_ascii(b: u8) -> Self {
        debug_assert!(matches!(b, b'A' | b'C' | b'G' | b'T' | b'S' | b'W'));
        match b {
            b'S' | b'C' | b'G' => Dna::S,
            _ => Dna::W,
        }
    }

    /// Checked translation of an ASCII byte.
    ///
    /// Returns `Some(Dna::S)` for `S`, `C`, `G`; `Some(Dna::W)` for `W`, `A`, `T`;
    /// and `None` for any other byte (matching is case-sensitive).
    fn try_from_ascii(c: u8) -> Option<Self> {
        match c {
            b'S' | b'C' | b'G' => Some(Dna::S),
            b'W' | b'A' | b'T' => Some(Dna::W),
            _ => None,
        }
    }

    /// The degenerate symbol for this value: `'S'` or `'W'`.
    fn to_char(self) -> char {
        match self {
            Dna::S => 'S',
            Dna::W => 'W',
        }
    }

    /// The enum discriminant as a `u8`.
    fn to_bits(self) -> u8 {
        self as u8
    }

    /// Iterate over every symbol of the alphabet, `S` first and then `W`.
    fn items() -> impl Iterator<Item = Self> {
        // A fixed-size array avoids the heap allocation of `vec![..]`.
        IntoIterator::into_iter([Dna::S, Dna::W])
    }
}

impl Complement for Dna {
/// This representation collapses complements; `comp(x) == x`
fn comp(&self) -> Self {
Expand Down

0 comments on commit b0e1043

Please sign in to comment.