Skip to content

Commit

Permalink
hugely simplified unicode
Browse files (browse the repository at this point in the history)
  • Loading branch information
SymmetricChaos committed Nov 17, 2024
1 parent 2c9582f commit 72a1893
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 87 deletions.
108 changes: 28 additions & 80 deletions codes/src/text_standards/unicode.rs
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
use itertools::Itertools;
use utils::text_functions::{u16_to_string, u32_to_string, u8_to_string, NumRep};
use utils::byte_formatting::ByteFormat;

use crate::{errors::CodeError, traits::Code};

Expand All @@ -12,107 +12,60 @@ pub enum UnicodeEncoding {

pub struct Unicode {
pub encoding: UnicodeEncoding,
pub mode: NumRep,
pub mode: ByteFormat,
}

impl Unicode {
fn utf8_encode(&self, text: &str) -> Result<String, CodeError> {
Ok(text.bytes().map(|n| u8_to_string(n, self.mode)).join(" "))
Ok(self.mode.byte_iter_to_text(text.bytes()))
}

fn utf16_encode(&self, text: &str) -> Result<String, CodeError> {
Ok(text
.encode_utf16()
.map(|n| u16_to_string(n, self.mode))
.join(" "))
Ok(self
.mode
.u16_slice_to_text_be(text.encode_utf16().collect_vec()))
}

fn utf32_encode(&self, text: &str) -> Result<String, CodeError> {
Ok(text
.chars()
.map(|c| u32::from(c))
.map(|n| u32_to_string(n, self.mode))
.join(" "))
Ok(self
.mode
.u32_slice_to_text_be(text.chars().map(|c| u32::from(c)).collect_vec()))
}

fn utf8_decode(&self, text: &str) -> Result<String, CodeError> {
let chunks = text.split(" ");
let radix = self.mode.radix();
let mut vec = Vec::with_capacity(chunks.clone().count());

for chunk in chunks {
match u8::from_str_radix(chunk, radix) {
Ok(n) => vec.push(n),
Err(_) => {
return Err(CodeError::Input(format!(
"CodeError decoding UTF-8, unable to parse string: {}",
chunk
)))
}
}
}
let v = self
.mode
.text_to_bytes(text)
.map_err(|e| CodeError::Input(e.to_string()))?;

String::from_utf8(vec).map_err(|e| CodeError::Input(e.to_string()))
String::from_utf8(v).map_err(|e| CodeError::Input(e.to_string()))
}

fn utf16_decode(&self, text: &str) -> Result<String, CodeError> {
let chunks = text.split(" ");
let radix = self.mode.radix();
let mut vec = Vec::with_capacity(chunks.clone().count());

for chunk in chunks {
match u16::from_str_radix(chunk, radix) {
Ok(n) => vec.push(n),
Err(_) => {
return Err(CodeError::Input(format!(
"CodeError decoding UTF-16, unable to parse string: {}",
chunk
)))
}
}
}
let v = self
.mode
.text_to_u16_be(text)
.map_err(|e| CodeError::Input(e.to_string()))?;

String::from_utf16(&vec).map_err(|e| CodeError::Input(e.to_string()))
String::from_utf16(&v).map_err(|e| CodeError::Input(e.to_string()))
}

fn utf32_decode(&self, text: &str) -> Result<String, CodeError> {
let chunks = text.split(" ");

let mut out = String::with_capacity(chunks.clone().count());

let radix = self.mode.radix();

for chunk in chunks {
match u32::from_str_radix(chunk, radix) {
Ok(n) => {
match char::from_u32(n) {
Some(c) => out.push(c),
None => {
return Err(CodeError::Input(format!(
"UTF-32 decoding CodeError, invalid input string: {}",
chunk
)))
}
};
}
Err(_) => {
return Err(CodeError::Input(format!(
"CodeError decoding UTF-32 unable to parse string: {}",
chunk
)))
}
}
}

Ok(out)
Ok(self
.mode
.text_to_u32_be(text)
.map_err(|e| CodeError::Input(e.to_string()))?
.into_iter()
.map(|n| char::from_u32(n).unwrap_or('�'))
.collect())
}
}

impl Default for Unicode {
fn default() -> Self {
Unicode {
encoding: UnicodeEncoding::Utf8,
mode: NumRep::Binary,
mode: ByteFormat::Binary,
}
}
}
Expand Down Expand Up @@ -152,12 +105,7 @@ mod unicode_tests {
] {
code.encoding = encoding;

for mode in [
NumRep::Binary,
NumRep::Octal,
NumRep::Decimal,
NumRep::HexLower,
] {
for mode in [ByteFormat::Binary, ByteFormat::Hex, ByteFormat::Base64] {
code.mode = mode;
let encoded = code
.encode(PLAINTEXT)
Expand Down
13 changes: 6 additions & 7 deletions src/code_panel/unicode_controls.rs
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
use super::CodeFrame;
use crate::ui_elements::UiElements;
use codes::text_standards::unicode::{Unicode, UnicodeEncoding};
use utils::text_functions::NumRep;
use utils::byte_formatting::ByteFormat;

pub struct UnicodeFrame {
code: Unicode,
Expand Down Expand Up @@ -34,19 +34,18 @@ impl CodeFrame for UnicodeFrame {
ui.add_space(8.0);

match self.code.encoding {
UnicodeEncoding::Utf8 => ui.label("UTF-8 is the most widely used character encoding in the modern world, partly because it is a superset of ASCII the previous dominant standard, and is the recommended way of encoding Unicode. It is a variable length code that uses between one and four bytes per character. The one byte codes are equivalent to ASCII. The two byte codes cover most of the remaining world alphabets. The three byte codes contain the common CJK (Chinese, Japanese, Korean) characters. Finall the four byte codes are used for a huge variety of less common symbols include emoji, care CJK character, and other symbols."),
UnicodeEncoding::Utf8 => ui.label("UTF-8 is the most widely used character encoding in the modern world, partly because it is a superset of ASCII the previous dominant standard, and is the current recommended way of encoding Unicode. It is a variable length code that uses between one and four bytes per character. The one byte codes are equivalent to ASCII. The two byte codes cover most of the remaining world alphabets. The three byte codes contain the common CJK (Chinese, Japanese, Korean) characters. Finally the four byte codes are used for a huge variety of less common symbols include emoji, rare CJK character, and other symbols."),
UnicodeEncoding::Utf16 => ui.label("UTF-16 was the previous recommended standard for encoding Unicode, mostly prominently used by Microsoft Windows which adopted it before the creation of UTF-8. The encoding is variable width using either one or two code units of sixteen bits each. The single code unit characters cover all commonly used characters in world languages while less common symbols require two code units."),
UnicodeEncoding::Utf32 => ui.label("UTF-32 is the simple encoding of Unicode as it assigns 32 bits per character, representing the character's numeric value in Unicode. The first eleven bits are always zero as there are only 2^21 possible Unicode characters. Because of the large size and wasted space UTF-32 is rarely used for encoding text as a whole, rather it is used when representing individual characters on their own."),
UnicodeEncoding::Utf32 => ui.label("UTF-32 is the simplest encoding of Unicode as it assigns 32 bits per character, representing the character's numeric value in Unicode. However it is quite inefficient as the first eleven bits are always zero because there are only 2^21 possible Unicode characters. Because of the large size and wasted space UTF-32 is rarely used for encoding text as a whole, but it is used when representing individual characters on their own."),
};
ui.add_space(16.0);

ui.group(|ui| {
ui.subheading("Representation");
ui.horizontal(|ui| {
ui.selectable_value(&mut self.code.mode, NumRep::Binary, "Binary");
ui.selectable_value(&mut self.code.mode, NumRep::Octal, "Octal");
ui.selectable_value(&mut self.code.mode, NumRep::Decimal, "Decimal");
ui.selectable_value(&mut self.code.mode, NumRep::HexLower, "Hexadecimal");
ui.selectable_value(&mut self.code.mode, ByteFormat::Binary, "Binary");
ui.selectable_value(&mut self.code.mode, ByteFormat::Hex, "Hexadecimal");
ui.selectable_value(&mut self.code.mode, ByteFormat::Base64, "Base64");
});
});
ui.add_space(16.0);
Expand Down

0 comments on commit 72a1893

Please sign in to comment.