Skip to content

Commit

Permalink
fix(unicode): use u8 under the hood
Browse files Browse the repository at this point in the history
  • Loading branch information
pinbraerts committed Nov 23, 2024
1 parent ffa8d4c commit 8c92d50
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 30 deletions.
57 changes: 31 additions & 26 deletions src/lexer.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::borrow::Cow;

#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Paired {
Bracket, // []
Expand All @@ -7,13 +9,13 @@ pub enum Paired {

#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Lexem {
String(String),
String(Vec<u8>),
Open(Paired),
Close(Paired),
Comma,
Colon,
Else(String),
WhiteSpace(String),
Else(Vec<u8>),
WhiteSpace(Vec<u8>),
}

impl Default for Lexem {
Expand All @@ -22,7 +24,7 @@ impl Default for Lexem {
}
}

fn fix_str(s: &str) -> String {
fn fix_str(s: Cow<'_, str>) -> Cow<'_, str> {
Some('\"')
.into_iter()
.chain(
Expand All @@ -35,7 +37,7 @@ fn fix_str(s: &str) -> String {
.collect()
}

fn fix_else(s: &str) -> String {
fn fix_else(s: Cow<'_, str>) -> Cow<'_, str> {
match s.to_lowercase().as_str() {
"null" | "nil" | "nul" | "none" => "null".into(),
"true" => "true".into(),
Expand All @@ -60,9 +62,12 @@ impl From<Lexem> for String {
Lexem::Open(Paired::Brace) => "{".into(),
Lexem::Close(Paired::Brace) => "}".into(),
Lexem::Open(Paired::File) | Lexem::Close(Paired::File) => "".into(),
Lexem::Else(s) => fix_else(&s),
Lexem::String(s) => fix_str(s.get(1..s.len() - 1).unwrap_or_default()),
Lexem::WhiteSpace(s) => s,
Lexem::Else(s) => fix_else(String::from_utf8_lossy(&s)).into_owned(),
Lexem::String(s) => fix_str(String::from_utf8_lossy(
s.get(1..s.len() - 1).unwrap_or_default(),
))
.into_owned(),
Lexem::WhiteSpace(s) => String::from_utf8_lossy(&s).into_owned(),
}
}
}
Expand All @@ -73,42 +78,42 @@ pub struct Lexer {
}

impl Lexer {
pub fn process(&mut self, character: char) -> Option<Lexem> {
pub fn process(&mut self, character: u8) -> Option<Lexem> {
if let Some(Lexem::String(s)) = &mut self.state {
let first_char = s.chars().next().unwrap_or_default();
let last_char = s.chars().last().unwrap_or_default();
let first_char = s.iter().next().cloned().unwrap_or_default();
let last_char = s.iter().last().cloned().unwrap_or_default();
s.push(character);
if last_char != '\\' && character == first_char {
if last_char != b'\\' && character == first_char {
return std::mem::take(&mut self.state);
}
return None;
}
let next = match character {
'[' => Lexem::Open(Paired::Bracket),
']' => Lexem::Close(Paired::Bracket),
'(' => Lexem::Open(Paired::Bracket),
')' => Lexem::Close(Paired::Bracket),
'{' => Lexem::Open(Paired::Brace),
'}' => Lexem::Close(Paired::Brace),
'\0' => Lexem::Close(Paired::File),
',' => Lexem::Comma,
':' | '=' => Lexem::Colon,
'"' | '\'' | '`' => {
return std::mem::replace(&mut self.state, Some(Lexem::String(character.into())));
b'[' => Lexem::Open(Paired::Bracket),
b']' => Lexem::Close(Paired::Bracket),
b'(' => Lexem::Open(Paired::Bracket),
b')' => Lexem::Close(Paired::Bracket),
b'{' => Lexem::Open(Paired::Brace),
b'}' => Lexem::Close(Paired::Brace),
b'\0' => Lexem::Close(Paired::File),
b',' => Lexem::Comma,
b':' | b'=' => Lexem::Colon,
b'"' | b'\'' | b'`' => {
return std::mem::replace(&mut self.state, Some(Lexem::String(vec![character])));
}
_ => {
if character.is_whitespace() {
if character.is_ascii_whitespace() {
if let Some(Lexem::WhiteSpace(s)) = &mut self.state {
s.push(character);
return None;
}
Lexem::WhiteSpace(character.into())
Lexem::WhiteSpace(vec![character])
} else {
if let Some(Lexem::Else(s)) = &mut self.state {
s.push(character);
return None;
}
Lexem::Else(character.into())
Lexem::Else(vec![character])
}
}
};
Expand Down
9 changes: 7 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ where
let mut parser = Parser::default();
ChunkReader::new(reader, chunk_size)
.flatten()
.map(|c| c as char)
.chain(Some('\0'))
.chain(Some(0))
.filter_map(|c| lexer.process(c))
.chain([Default::default(), Default::default()])
.flat_map(|l| parser.parse(l))
Expand Down Expand Up @@ -48,6 +47,12 @@ mod tests {
assert_eq!("", process(""));
}

// Round-trip check for input that contains multi-byte characters.
//
// NOTE(review): despite the name, the input is not "non-Unicode" -- a Rust
// string literal is always valid UTF-8. What distinguishes this case is that
// the keys and values are non-ASCII (Cyrillic), so every character spans more
// than one byte. Presumably this guards the byte-oriented lexer against
// corrupting multi-byte sequences -- confirm against `Lexer::process`.
#[test]
fn non_unicode() {
// Already well-formed JSON with a Cyrillic key and value.
let value = r#"{"некоторое":"значение"}"#;
// The output is expected to equal the input exactly.
assert_eq!(value, process(value));
}

#[test]
fn valid() {
let value = r#"{"a":3,"b": 4}"#;
Expand Down
4 changes: 2 additions & 2 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ fn validate(state: State, lexem: Lexem) -> Validate {
impl Parser {
pub fn parse(&mut self, lexem: Lexem) -> Vec<Token> {
let mut result = Vec::new();
if let Lexem::WhiteSpace(s) = lexem {
self.whitespace.push_str(s.as_str());
if let Lexem::WhiteSpace(_) = lexem {
self.whitespace.push_str(&String::from(lexem));
return result;
}
let mut tokens = vec![Token::new(lexem, std::mem::take(&mut self.whitespace))];
Expand Down

0 comments on commit 8c92d50

Please sign in to comment.