From 209536269d1fc6f0ddc7a7ff261ac5776b035d49 Mon Sep 17 00:00:00 2001 From: jeff-k Date: Fri, 31 May 2024 19:21:35 +0100 Subject: [PATCH] migrate codecs to new Codec trait def --- bio-seq/src/codec/amino.rs | 12 +++++++++++ bio-seq/src/codec/dna.rs | 2 +- bio-seq/src/codec/iupac.rs | 31 +++++++++++++++++++++++++++ bio-seq/src/codec/text.rs | 43 ++++++++++++++++++++++++++------------ bio-seq/src/lib.rs | 11 +++++----- bio-seq/src/seq/mod.rs | 26 +++++++++++------------ 6 files changed, 91 insertions(+), 34 deletions(-) diff --git a/bio-seq/src/codec/amino.rs b/bio-seq/src/codec/amino.rs index 1056b22..6aa1bdb 100644 --- a/bio-seq/src/codec/amino.rs +++ b/bio-seq/src/codec/amino.rs @@ -51,6 +51,18 @@ pub enum Amino { X = 0b000011, // TAA (stop) } +impl From for u8 { + fn from(b: Amino) -> u8 { + b as u8 + } +} + +impl core::fmt::Display for Amino { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_char()) + } +} + #[macro_export] macro_rules! amino { ($seq:expr) => { diff --git a/bio-seq/src/codec/dna.rs b/bio-seq/src/codec/dna.rs index d3ee2a1..7f6ceb1 100644 --- a/bio-seq/src/codec/dna.rs +++ b/bio-seq/src/codec/dna.rs @@ -69,7 +69,7 @@ impl Codec for Dna { impl Complement for Dna { /// This 2-bit representation of nucleotides lends itself to a very fast /// complement implementation with bitwise xor - fn to_comp(&self) -> Self { + fn comp(&self) -> Self { // flip the bits let b = *self as u8 ^ 0b11; Dna::unsafe_from_bits(b) diff --git a/bio-seq/src/codec/iupac.rs b/bio-seq/src/codec/iupac.rs index 7e41660..c26643c 100644 --- a/bio-seq/src/codec/iupac.rs +++ b/bio-seq/src/codec/iupac.rs @@ -55,6 +55,37 @@ use crate::seq::{Seq, SeqSlice}; use core::ops::{BitAnd, BitOr}; +/* +const LTABLE: [u8; 256] = { + let mut table = [0; 256]; + + table[b'A' as usize] = 0b1000; + table[b'C' as usize] = 0b0100; + table[b'G' as usize] = 0b0010; + table[b'T' as usize] = 0b0001; + table[b'R' as usize] = 0b1010; + table[b'Y' as usize] = 0b0101; + table[b'S' as usize] = 0b0110; + table[b'W' as usize] = 0b1001; + table[b'K' as usize] = 0b0011; + table[b'M' as usize] = 0b1100; + table[b'B' as usize] = 0b0111; + table[b'D' as usize] = 0b1011; + table[b'H' as usize] = 0b1101; + table[b'V' as usize] = 0b1110; + table[b'N' as usize] = 0b1111; + table[b'-' as usize] = 0b0000; + + table +}; +*/ + +impl From for u8 { + fn from(b: Iupac) -> u8 { + b as u8 + } +} + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Codec)] #[bits(4)] #[repr(u8)] diff --git a/bio-seq/src/codec/text.rs b/bio-seq/src/codec/text.rs index 4469b81..94ed25d 100644 --- a/bio-seq/src/codec/text.rs +++ b/bio-seq/src/codec/text.rs @@ -1,4 +1,4 @@ -//! 8-bit UTF-8/ASCII representation of nucleotides +//! 8-bit ASCII representation of nucleotides //! //! This encoding is a literal interpretation of bytes of text as DNA use crate::codec::{dna, Codec, Complement}; @@ -9,28 +9,35 @@ use crate::error::ParseBioError; pub struct Dna(u8); impl Codec for Dna { - type Error = ParseBioError; - - const BITS: usize = 8; + const BITS: u8 = 8; fn unsafe_from_bits(b: u8) -> Self { Self(b) } - fn try_from_bits(b: u8) -> Result { - Ok(Self(b)) + fn try_from_bits(b: u8) -> Option { + Some(Self(b)) + } + + fn unsafe_from_ascii(c: u8) -> Self { + Self(c) } - fn from_char(c: char) -> Result { - match u8::try_from(c) { - Ok(b) => Ok(Self(b)), - _ => Err(Self::Error {}), + fn try_from_ascii(c: u8) -> Option { + // if c.is_ascii_alphanumeric() { + match c { + b'A' | b'C' | b'G' | b'T' | b'N' => Some(Self(c)), + _ => None, } } fn to_char(self) -> char { self.0.into() } + + fn items() -> impl Iterator { + vec![Dna(b'A'), Dna(b'C'), Dna(b'G'), Dna(b'T'), Dna(b'N')].into_iter() + } } //impl Eq for Dna { @@ -40,8 +47,15 @@ impl Codec for Dna { //} impl Complement for Dna { - fn comp(self) -> Self { - unimplemented!() + fn comp(&self) -> Self { + match self { + Self(b'A') => Self(b'T'), + Self(b'C') => Self(b'G'), + Self(b'G') => Self(b'C'), + Self(b'T') => Self(b'A'), + Self(b'N') => Self(b'N'), + _ => Self(b'N'), + } } } @@ -71,11 +85,14 @@ impl TryFrom for dna::Dna { b'C' => Ok(dna::Dna::C), b'G' => Ok(dna::Dna::G), b'T' => Ok(dna::Dna::T), + // Todo: decide whether to support lower cases + /* b'a' => Ok(dna::Dna::A), b'c' => Ok(dna::Dna::C), b'g' => Ok(dna::Dna::G), b't' => Ok(dna::Dna::T), - _ => Err(ParseBioError {}), + */ + _ => Err(ParseBioError::UnrecognisedBase(base.0)), } } } diff --git a/bio-seq/src/lib.rs b/bio-seq/src/lib.rs index f2177e3..9d20575 100644 --- a/bio-seq/src/lib.rs +++ b/bio-seq/src/lib.rs @@ -54,20 +54,19 @@ pub mod codec; pub mod error; pub mod kmer; pub mod seq; -//pub mod translation; +pub mod translation; pub mod prelude { - // pub use crate::codec::amino::Amino; + pub use crate::codec::amino::Amino; pub use crate::codec::dna::Dna; - // pub use crate::codec::iupac::Iupac; + pub use crate::codec::iupac::Iupac; pub use crate::codec::{Codec, Complement}; pub use crate::kmer::Kmer; pub use crate::seq::{ReverseComplement, Seq, SeqSlice}; - // pub use crate::{amino, dna, iupac, kmer}; - pub use crate::{dna, kmer}; + pub use crate::{amino, dna, iupac, kmer}; - // pub use crate::translation; + pub use crate::translation; pub use core::str::FromStr; pub use crate::error::ParseBioError; diff --git a/bio-seq/src/seq/mod.rs b/bio-seq/src/seq/mod.rs index ce39d86..a4b4a23 100644 --- a/bio-seq/src/seq/mod.rs +++ b/bio-seq/src/seq/mod.rs @@ -29,8 +29,8 @@ pub mod index; pub mod iterators; -use crate::codec::{Codec, Complement}; -//use crate::codec::{text, Codec, Complement}; +//use crate::codec::{Codec, Complement}; +use crate::codec::{text, Codec, Complement}; use crate::error::ParseBioError; use crate::{Bs, Bv, Order}; @@ -62,17 +62,6 @@ pub struct SeqSlice { bs: Bs, } -/* -impl From> for Seq { - fn from(vec: Vec) -> Self { - Seq { - _p: PhantomData, - bv: Bv::from_vec(vec), - } - } -} - */ - impl From> for usize { fn from(slice: Seq) -> usize { assert!(slice.bv.len() <= usize::BITS as usize); @@ -113,7 +102,7 @@ impl ReverseComplement for SeqSlice { /// The inefficient default complementation of complement fn revcomp(&self) -> Seq { let mut seq = Seq::::with_capacity(self.len()); - seq.extend(self.rev().map(|base| base.to_comp())); + seq.extend(self.rev().map(|base| base.comp())); seq } } @@ -516,6 +505,15 @@ impl FromIterator for Seq { } } +impl From> for Seq { + fn from(vec: Vec) -> Self { + Seq { + _p: PhantomData, + bv: Bv::from_vec(vec), + } + } +} + #[cfg(test)] mod tests { use crate::prelude::*;