From 72a1893ee18188ca33c58b29c81cb7663ad9ec2b Mon Sep 17 00:00:00 2001 From: SymmetricChaos <42520289+SymmetricChaos@users.noreply.github.com> Date: Sun, 17 Nov 2024 15:16:21 -0500 Subject: [PATCH] hugely simplified unicode --- codes/src/text_standards/unicode.rs | 108 ++++++++-------------------- src/code_panel/unicode_controls.rs | 13 ++-- 2 files changed, 34 insertions(+), 87 deletions(-) diff --git a/codes/src/text_standards/unicode.rs b/codes/src/text_standards/unicode.rs index 1f133409..0213a957 100644 --- a/codes/src/text_standards/unicode.rs +++ b/codes/src/text_standards/unicode.rs @@ -1,5 +1,5 @@ use itertools::Itertools; -use utils::text_functions::{u16_to_string, u32_to_string, u8_to_string, NumRep}; +use utils::byte_formatting::ByteFormat; use crate::{errors::CodeError, traits::Code}; @@ -12,99 +12,52 @@ pub enum UnicodeEncoding { pub struct Unicode { pub encoding: UnicodeEncoding, - pub mode: NumRep, + pub mode: ByteFormat, } impl Unicode { fn utf8_encode(&self, text: &str) -> Result { - Ok(text.bytes().map(|n| u8_to_string(n, self.mode)).join(" ")) + Ok(self.mode.byte_iter_to_text(text.bytes())) } fn utf16_encode(&self, text: &str) -> Result { - Ok(text - .encode_utf16() - .map(|n| u16_to_string(n, self.mode)) - .join(" ")) + Ok(self + .mode + .u16_slice_to_text_be(text.encode_utf16().collect_vec())) } fn utf32_encode(&self, text: &str) -> Result { - Ok(text - .chars() - .map(|c| u32::from(c)) - .map(|n| u32_to_string(n, self.mode)) - .join(" ")) + Ok(self + .mode + .u32_slice_to_text_be(text.chars().map(|c| u32::from(c)).collect_vec())) } fn utf8_decode(&self, text: &str) -> Result { - let chunks = text.split(" "); - let radix = self.mode.radix(); - let mut vec = Vec::with_capacity(chunks.clone().count()); - - for chunk in chunks { - match u8::from_str_radix(chunk, radix) { - Ok(n) => vec.push(n), - Err(_) => { - return Err(CodeError::Input(format!( - "CodeError decoding UTF-8, unable to parse string: {}", - chunk - ))) - } - } - } + let v = self + .mode + .text_to_bytes(text) + .map_err(|e| CodeError::Input(e.to_string()))?; - String::from_utf8(vec).map_err(|e| CodeError::Input(e.to_string())) + String::from_utf8(v).map_err(|e| CodeError::Input(e.to_string())) } fn utf16_decode(&self, text: &str) -> Result { - let chunks = text.split(" "); - let radix = self.mode.radix(); - let mut vec = Vec::with_capacity(chunks.clone().count()); - - for chunk in chunks { - match u16::from_str_radix(chunk, radix) { - Ok(n) => vec.push(n), - Err(_) => { - return Err(CodeError::Input(format!( - "CodeError decoding UTF-16, unable to parse string: {}", - chunk - ))) - } - } - } + let v = self + .mode + .text_to_u16_be(text) + .map_err(|e| CodeError::Input(e.to_string()))?; - String::from_utf16(&vec).map_err(|e| CodeError::Input(e.to_string())) + String::from_utf16(&v).map_err(|e| CodeError::Input(e.to_string())) } fn utf32_decode(&self, text: &str) -> Result { - let chunks = text.split(" "); - - let mut out = String::with_capacity(chunks.clone().count()); - - let radix = self.mode.radix(); - - for chunk in chunks { - match u32::from_str_radix(chunk, radix) { - Ok(n) => { - match char::from_u32(n) { - Some(c) => out.push(c), - None => { - return Err(CodeError::Input(format!( - "UTF-32 decoding CodeError, invalid input string: {}", - chunk - ))) - } - }; - } - Err(_) => { - return Err(CodeError::Input(format!( - "CodeError decoding UTF-32 unable to parse string: {}", - chunk - ))) - } - } - } - - Ok(out) + Ok(self + .mode + .text_to_u32_be(text) + .map_err(|e| CodeError::Input(e.to_string()))? + .into_iter() + .map(|n| char::from_u32(n).unwrap_or('�')) + .collect()) } } @@ -112,7 +65,7 @@ impl Default for Unicode { fn default() -> Self { Unicode { encoding: UnicodeEncoding::Utf8, - mode: NumRep::Binary, + mode: ByteFormat::Binary, } } } @@ -152,12 +105,7 @@ mod unicode_tests { ] { code.encoding = encoding; - for mode in [ - NumRep::Binary, - NumRep::Octal, - NumRep::Decimal, - NumRep::HexLower, - ] { + for mode in [ByteFormat::Binary, ByteFormat::Hex, ByteFormat::Base64] { code.mode = mode; let encoded = code .encode(PLAINTEXT) diff --git a/src/code_panel/unicode_controls.rs b/src/code_panel/unicode_controls.rs index 3e98cba0..fdb02ac8 100644 --- a/src/code_panel/unicode_controls.rs +++ b/src/code_panel/unicode_controls.rs @@ -1,7 +1,7 @@ use super::CodeFrame; use crate::ui_elements::UiElements; use codes::text_standards::unicode::{Unicode, UnicodeEncoding}; -use utils::text_functions::NumRep; +use utils::byte_formatting::ByteFormat; pub struct UnicodeFrame { code: Unicode, @@ -34,19 +34,18 @@ impl CodeFrame for UnicodeFrame { ui.add_space(8.0); match self.code.encoding { - UnicodeEncoding::Utf8 => ui.label("UTF-8 is the most widely used character encoding in the modern world, partly because it is a superset of ASCII the previous dominant standard, and is the recommended way of encoding Unicode. It is a variable length code that uses between one and four bytes per character. The one byte codes are equivalent to ASCII. The two byte codes cover most of the remaining world alphabets. The three byte codes contain the common CJK (Chinese, Japanese, Korean) characters. Finall the four byte codes are used for a huge variety of less common symbols include emoji, care CJK character, and other symbols."), + UnicodeEncoding::Utf8 => ui.label("UTF-8 is the most widely used character encoding in the modern world, partly because it is a superset of ASCII the previous dominant standard, and is the current recommended way of encoding Unicode. It is a variable length code that uses between one and four bytes per character. The one byte codes are equivalent to ASCII. The two byte codes cover most of the remaining world alphabets. The three byte codes contain the common CJK (Chinese, Japanese, Korean) characters. Finally the four byte codes are used for a huge variety of less common symbols include emoji, rare CJK character, and other symbols."), UnicodeEncoding::Utf16 => ui.label("UTF-16 was the previous recommended standard for encoding Unicode, mostly prominently used by Microsoft Windows which adopted it before the creation of UTF-8. The encoding is variable width using either one or two code units of sixteen bits each. The single code unit characters cover all commonly used characters in world languages while less common symbols require two code units."), - UnicodeEncoding::Utf32 => ui.label("UTF-32 is the simple encoding of Unicode as it assigns 32 bits per character, representing the character's numeric value in Unicode. The first eleven bits are always zero as there are only 2^21 possible Unicode characters. Because of the large size and wasted space UTF-32 is rarely used for encoding text as a whole, rather it is used when representing individual characters on their own."), + UnicodeEncoding::Utf32 => ui.label("UTF-32 is the simplest encoding of Unicode as it assigns 32 bits per character, representing the character's numeric value in Unicode. However it is quite inefficient as the first eleven bits are always zero because there are only 2^21 possible Unicode characters. Because of the large size and wasted space UTF-32 is rarely used for encoding text as a whole, but it is used when representing individual characters on their own."), }; ui.add_space(16.0); ui.group(|ui| { ui.subheading("Representation"); ui.horizontal(|ui| { - ui.selectable_value(&mut self.code.mode, NumRep::Binary, "Binary"); - ui.selectable_value(&mut self.code.mode, NumRep::Octal, "Octal"); - ui.selectable_value(&mut self.code.mode, NumRep::Decimal, "Decimal"); - ui.selectable_value(&mut self.code.mode, NumRep::HexLower, "Hexadecimal"); + ui.selectable_value(&mut self.code.mode, ByteFormat::Binary, "Binary"); + ui.selectable_value(&mut self.code.mode, ByteFormat::Hex, "Hexadecimal"); + ui.selectable_value(&mut self.code.mode, ByteFormat::Base64, "Base64"); }); }); ui.add_space(16.0);