From ffad81e842c6392999146df940399cd7704898c4 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Sat, 23 Nov 2024 15:14:17 +0300 Subject: [PATCH] fix(unicode): use u8 under the hood (#4) Do not convert into strings prematurely. Use Vec<u8> --- src/lexer.rs | 59 ++++++++++++++++++++++++++++----------------------- src/main.rs | 9 ++++++-- src/parser.rs | 4 ++-- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index 1c780f6..565df43 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub enum Paired { Bracket, // [] @@ -7,13 +9,13 @@ pub enum Paired { #[derive(Debug, Clone, Eq, PartialEq)] pub enum Lexem { - String(String), + String(Vec<u8>), Open(Paired), Close(Paired), Comma, Colon, - Else(String), - WhiteSpace(String), + Else(Vec<u8>), + WhiteSpace(Vec<u8>), } impl Default for Lexem { @@ -22,7 +24,7 @@ impl Default for Lexem { } } -fn fix_str(s: &str) -> String { +fn fix_str(s: Cow<'_, str>) -> Cow<'_, str> { Some('\"') .into_iter() .chain( @@ -35,14 +37,14 @@ fn fix_str(s: &str) -> String { .collect() } -fn fix_else(s: &str) -> String { +fn fix_else(s: Cow<'_, str>) -> Cow<'_, str> { match s.to_lowercase().as_str() { "null" | "nil" | "nul" | "none" => "null".into(), "true" => "true".into(), "false" => "false".into(), &_ => { if s.chars().all(|c| c.is_numeric()) { - s.into() + s } else { fix_str(s) } @@ -60,9 +62,12 @@ impl From<Lexem> for String { Lexem::Open(Paired::Brace) => "{".into(), Lexem::Close(Paired::Brace) => "}".into(), Lexem::Open(Paired::File) | Lexem::Close(Paired::File) => "".into(), - Lexem::Else(s) => fix_else(&s), - Lexem::String(s) => fix_str(s.get(1..s.len() - 1).unwrap_or_default()), - Lexem::WhiteSpace(s) => s, + Lexem::Else(s) => fix_else(String::from_utf8_lossy(&s)).into_owned(), + Lexem::String(s) => fix_str(String::from_utf8_lossy( + s.get(1..s.len() - 1).unwrap_or_default(), + )) + .into_owned(), + Lexem::WhiteSpace(s) => 
String::from_utf8_lossy(&s).into_owned(), } } } @@ -73,42 +78,42 @@ pub struct Lexer { } impl Lexer { - pub fn process(&mut self, character: char) -> Option<Lexem> { + pub fn process(&mut self, character: u8) -> Option<Lexem> { if let Some(Lexem::String(s)) = &mut self.state { - let first_char = s.chars().next().unwrap_or_default(); - let last_char = s.chars().last().unwrap_or_default(); + let first_char = s.iter().next().cloned().unwrap_or_default(); + let last_char = s.iter().last().cloned().unwrap_or_default(); s.push(character); - if last_char != '\\' && character == first_char { + if last_char != b'\\' && character == first_char { return std::mem::take(&mut self.state); } return None; } let next = match character { - '[' => Lexem::Open(Paired::Bracket), - ']' => Lexem::Close(Paired::Bracket), - '(' => Lexem::Open(Paired::Bracket), - ')' => Lexem::Close(Paired::Bracket), - '{' => Lexem::Open(Paired::Brace), - '}' => Lexem::Close(Paired::Brace), - '\0' => Lexem::Close(Paired::File), - ',' => Lexem::Comma, - ':' | '=' => Lexem::Colon, - '"' | '\'' | '`' => { - return std::mem::replace(&mut self.state, Some(Lexem::String(character.into()))); + b'[' => Lexem::Open(Paired::Bracket), + b']' => Lexem::Close(Paired::Bracket), + b'(' => Lexem::Open(Paired::Bracket), + b')' => Lexem::Close(Paired::Bracket), + b'{' => Lexem::Open(Paired::Brace), + b'}' => Lexem::Close(Paired::Brace), + b'\0' => Lexem::Close(Paired::File), + b',' => Lexem::Comma, + b':' | b'=' => Lexem::Colon, + b'"' | b'\'' | b'`' => { + return std::mem::replace(&mut self.state, Some(Lexem::String(vec![character]))); } _ => { - if character.is_whitespace() { + if character.is_ascii_whitespace() { if let Some(Lexem::WhiteSpace(s)) = &mut self.state { s.push(character); return None; } - Lexem::WhiteSpace(character.into()) + Lexem::WhiteSpace(vec![character]) } else { if let Some(Lexem::Else(s)) = &mut self.state { s.push(character); return None; } - Lexem::Else(character.into()) + Lexem::Else(vec![character]) } } }; diff 
--git a/src/main.rs b/src/main.rs index 18a97e3..5412e27 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,8 +17,7 @@ where let mut parser = Parser::default(); ChunkReader::new(reader, chunk_size) .flatten() - .map(|c| c as char) - .chain(Some('\0')) + .chain(Some(0)) .filter_map(|c| lexer.process(c)) .chain([Default::default(), Default::default()]) .flat_map(|l| parser.parse(l)) @@ -48,6 +47,12 @@ mod tests { assert_eq!("", process("")); } + #[test] + fn non_unicode() { + let value = r#"{"некоторое":"значение"}"#; + assert_eq!(value, process(value)); + } + #[test] fn valid() { let value = r#"{"a":3,"b": 4}"#; diff --git a/src/parser.rs b/src/parser.rs index c897f26..25638dc 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -131,8 +131,8 @@ fn validate(state: State, lexem: Lexem) -> Validate { impl Parser { pub fn parse(&mut self, lexem: Lexem) -> Vec { let mut result = Vec::new(); - if let Lexem::WhiteSpace(s) = lexem { - self.whitespace.push_str(s.as_str()); + if let Lexem::WhiteSpace(_) = lexem { + self.whitespace.push_str(&String::from(lexem)); return result; } let mut tokens = vec![Token::new(lexem, std::mem::take(&mut self.whitespace))];