From b0e1043247010e0da34e5b21ae4b2ec8b8e93325 Mon Sep 17 00:00:00 2001 From: jeff-k Date: Wed, 23 Oct 2024 19:56:17 +0100 Subject: [PATCH] more useful 1-bit degenerate DNA encoding --- bio-seq/src/codec/degenerate/dna.rs | 54 +++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/bio-seq/src/codec/degenerate/dna.rs b/bio-seq/src/codec/degenerate/dna.rs index 58055bc..37fa2c3 100644 --- a/bio-seq/src/codec/degenerate/dna.rs +++ b/bio-seq/src/codec/degenerate/dna.rs @@ -2,15 +2,63 @@ use crate::codec::Codec; use crate::codec::Complement; use crate::seq::{ReverseComplement, Seq}; -/// 1-bit encoding for `S`trong (`G`/`C`) and `W`eak (`A`/`T`) nucleotides -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Codec)] -#[bits(1)] +/// 1-bit encoding for `S`trong (`G`/`C`) and `W`eak (`A`/`T`) binding nucleotides +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(u8)] pub enum Dna { W = 0b0, S = 0b1, } +impl Codec for Dna { + const BITS: u8 = 1; + + /// Transmute a `u8` into a degenerate 1-bit nucleotide + /// + /// SAFETY: This only looks at the lowest bit of the `u8` (`b & 0b1`); debug builds also assert `b < 2` + fn unsafe_from_bits(b: u8) -> Self { + debug_assert!(b < 2); + unsafe { std::mem::transmute(b & 0b1) } + } + + /// Valid values are `0` and `1` + fn try_from_bits(b: u8) -> Option { + if b < 2 { + Some(unsafe { std::mem::transmute::(b) }) + } else { + None + } + } + + /// TODO: fast translation of A, T, W to 0 and C, G, S to 1 (not yet implemented: the body is `todo!()` and panics when called) + fn unsafe_from_ascii(_b: u8) -> Self { + todo!() + } + + fn try_from_ascii(c: u8) -> Option { + match c { + b'S' | b'C' | b'G' => Some(Dna::S), + b'W' | b'A' | b'T' => Some(Dna::W), + _ => None, + } + } + + fn to_char(self) -> char { + match self { + Dna::S => 'S', + Dna::W => 'W', + } + } + + fn to_bits(self) -> u8 { + self as u8 + } + + fn items() -> impl Iterator { + vec![Dna::S, Dna::W].into_iter() + } +} + impl Complement for Dna { /// This representation collapses complements; `comp(x) == x` fn 
comp(&self) -> Self {