From 497d3907278dcc0c8f6a94ae14ffca7ad828650b Mon Sep 17 00:00:00 2001 From: cdecompilador Date: Thu, 25 Apr 2024 12:18:10 +0200 Subject: [PATCH 1/2] architecture change: parser combinators --- src/main.rs | 139 +++--------------------------- src/parser.rs | 20 +++++ src/parser/ast.rs | 1 + src/parser/combinators.rs | 174 ++++++++++++++++++++++++++++++++++++++ src/parser/cst.rs | 1 + src/parser/error.rs | 16 ++++ src/parser/lex.rs | 16 ++++ 7 files changed, 239 insertions(+), 128 deletions(-) create mode 100644 src/parser.rs create mode 100644 src/parser/ast.rs create mode 100644 src/parser/combinators.rs create mode 100644 src/parser/cst.rs create mode 100644 src/parser/error.rs create mode 100644 src/parser/lex.rs diff --git a/src/main.rs b/src/main.rs index bac6549..3d8edfb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,144 +1,27 @@ -use std::cmp::min; -use std::fmt::{Debug, Display, Formatter}; -use regex::{Captures, Regex}; +use std::fmt::Debug; +mod parser; -#[derive(Debug, PartialEq)] -enum ParserError<'a> { - LiteralError(&'static str, &'a str), - RegexError(&'static str, &'a str), - SomeError(Vec>, Vec>), -} - -type ParserResult<'a, O> = Result<(O, &'a str), Vec>>; - -trait Parser<'a, O> { - fn parse(&self, input: &'a str) -> ParserResult<'a, O>; -} - -impl<'a, F, O> Parser<'a, O> for F -where - F: Fn(&'a str) -> ParserResult<'a, O>, -{ - fn parse(&self, input: &'a str) -> ParserResult<'a, O> { - self(input) - } -} - -fn pair<'a, O1, O2>(first: impl Parser<'a, O1>, second: impl Parser<'a, O2>) -> impl Parser<'a, (O1, O2)> { - move |input: &'a str| { - match first.parse(input) { - Ok((first_result, rest)) => { - match second.parse(rest) { - Ok((second_result, rest)) => { - Ok(((first_result, second_result), rest)) - } - Err(errors) => Err(errors), - } - } - Err(errors) => Err(errors), - } - } -} +use crate::parser::combinators::*; -fn some<'a, O>(first: impl Parser<'a, O>, second: impl Parser<'a, O>) -> impl Parser<'a, O> { - move |input: &'a str| { - match first.parse(input) { - Ok((result, rest)) => Ok((result, rest)), - Err(first_errors) => { - match second.parse(input) { - Ok((result, rest)) => Ok((result, rest)), - Err(second_errors) => { - Err(vec![ParserError::SomeError(first_errors, second_errors)]) - } - } - }, - } - } -} - -fn map<'a, I, O>(parser: impl Parser<'a, I>, f: impl Fn(I) -> O) -> impl Parser<'a, O> { - move |input: &'a str| { - match parser.parse(input) { - Ok((result, rest)) => Ok((f(result), rest)), - Err(errors) => Err(errors), - } - } -} - -fn literal<'a>(literal: &'static str) -> impl Parser<'a, &'static str> { - move |input: &'a str| { - if input.starts_with(literal) { - Ok((literal, &input[literal.len()..])) - } else { - Err(vec![ParserError::LiteralError(literal, &input[0..min(literal.len(), input.len())])]) - } - } -} - -fn regex<'a>(re_str: &'static str) -> impl Parser<'a, Captures<'a>> { - let re = Regex::new(re_str).unwrap(); - move |input: &'a str| { - match re.captures(input) { - Some(captures) => { - let match_length = captures.get(0).unwrap().end(); - Ok((captures, &input[match_length..])) - }, - None => { - let next_line = input.find(&['\n', '\r']).unwrap_or(min(10, input.len())); - Err(vec![ParserError::RegexError(re_str, &input[0..next_line])]) - }, - } - } -} - -fn execute<'a, O: Debug>(parser: impl Parser<'a, O>, input: &'a str) { +fn execute<'a, O: Debug, Err: Debug>(parser: impl parser::Parser<'a, O, Err>, input: &'a str) { match parser.parse(input) { Ok((result, rest)) => { println!("{:?} {:?}", result, rest); } - Err(errors) => { - for error in errors { - println!("{:?}", error); - } + Err(err) => { + println!("{:?}", err); } } } fn main() { - let mut file = std::fs::File::open("main.osta").unwrap(); + let file = std::fs::File::open("main.osta").unwrap(); let mmap = unsafe { memmap2::Mmap::map(&file).unwrap() }; let input = unsafe { std::str::from_utf8_unchecked(&mmap) }; - execute(pair(literal("hello"), pair(regex(r"\s+"), literal("world"))), input); -} - -#[cfg(test)] -mod tests { - use crate::ParserError::SomeError; - use super::*; - - #[test] - fn test_pair() { - let parser = pair(literal("foo"), literal("bar")); - assert_eq!(parser.parse("foobar"), Ok((("foo", "bar"), ""))); - assert_eq!(parser.parse("foobarbaz"), Ok((("foo", "bar"), "baz"))); - assert_eq!(parser.parse("foo"), Err(vec![ParserError::LiteralError("bar", "")])); - assert_eq!(parser.parse("bar"), Err(vec![ParserError::LiteralError("foo", "bar")])); - } - - #[test] - fn test_some() { - let parser = some(literal("foo"), literal("bar")); - assert_eq!(parser.parse("foo"), Ok(("foo", ""))); - assert_eq!(parser.parse("bar"), Ok(("bar", ""))); - assert_eq!(parser.parse("baz"), Err(vec![SomeError(vec![ParserError::LiteralError("foo", "baz")], vec![ParserError::LiteralError("bar", "baz")])])); - } - - #[test] - fn test_map() { - let parser = map(literal("foo"), |s| s.len()); - assert_eq!(parser.parse("foo"), Ok((3, ""))); - assert_eq!(parser.parse("bar"), Err(vec![ParserError::LiteralError("foo", "bar")])); - } + execute( + pair(literal("hello"), pair(regex(r"\s+"), literal("world"))), + input, + ); } diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..42a3291 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,20 @@ +pub mod ast; +pub mod combinators; +pub mod cst; +pub mod error; +pub mod lex; + +pub type ParseResult<'a, Out, Err = Vec> = Result<(Out, &'a str), Err>; + +pub trait Parser<'a, Out, Err = Vec> { + fn parse(&self, input: &'a str) -> ParseResult<'a, Out, Err>; +} + +impl<'a, F, Out, Err> Parser<'a, Out, Err> for F +where + F: Fn(&'a str) -> ParseResult<'a, Out, Err>, +{ + fn parse(&self, input: &'a str) -> ParseResult<'a, Out, Err> { + self(input) + } +} diff --git a/src/parser/ast.rs b/src/parser/ast.rs new file mode 100644 index 0000000..beed323 --- /dev/null +++ b/src/parser/ast.rs @@ -0,0 +1 @@ +pub enum Ast {} diff --git a/src/parser/combinators.rs b/src/parser/combinators.rs new file mode 100644 index 0000000..46ce46e --- /dev/null +++ b/src/parser/combinators.rs @@ -0,0 +1,174 @@ +//! This module contains the fundamental combinators used through all the parsing pipeline + +use regex::{Captures, Regex}; + +use super::*; + +#[derive(Debug, PartialEq)] +pub enum Either { + Left(L), + Right(R), +} + +pub fn pair<'a, Out1, Out2, Err1, Err2>( + first: impl Parser<'a, Out1, Err1>, + second: impl Parser<'a, Out2, Err2>, +) -> impl Parser<'a, (Out1, Out2), Either> { + move |input| match first.parse(input) { + Ok((first_result, rest)) => match second.parse(rest) { + Ok((second_result, rest)) => Ok(((first_result, second_result), rest)), + Err(err) => Err(Either::Right(err)), + }, + Err(err) => Err(Either::Left(err)), + } +} + +pub fn left<'a, Out1, Out2, Err1, Err2>( + parser: impl Parser<'a, (Out1, Out2), Either>, +) -> impl Parser<'a, Out1, Either> { + map(parser, |(left, _)| left) +} + +pub fn right<'a, Out1, Out2, Err1, Err2>( + parser: impl Parser<'a, (Out1, Out2), Either>, +) -> impl Parser<'a, Out2, Either> { + map(parser, |(_, right)| right) +} + +pub fn some<'a, Out1, Out2, Err1, Err2>( + first: impl Parser<'a, Out1, Err1>, + second: impl Parser<'a, Out2, Err2>, +) -> impl Parser<'a, Either, (Err1, Err2)> { + move |input| match first.parse(input) { + Ok((result1, rest1)) => Ok((Either::Left(result1), rest1)), + Err(err1) => match second.parse(input) { + Ok((result2, rest2)) => Ok((Either::Right(result2), rest2)), + Err(err2) => Err((err1, err2)), + }, + } +} + +pub fn map<'a, In, Out, Err>( + parser: impl Parser<'a, In, Err>, + f: impl Fn(In) -> Out, +) -> impl Parser<'a, Out, Err> { + move |input| match parser.parse(input) { + Ok((result, rest)) => Ok((f(result), rest)), + Err(errors) => Err(errors), + } +} + +pub fn map_err<'a, Out, InErr, OutErr>( + parser: impl Parser<'a, Out, InErr>, + f: impl Fn(InErr) -> OutErr, +) -> impl Parser<'a, Out, OutErr> { + move |input| match parser.parse(input) { + Ok((result, rest)) => Ok((result, rest)), + Err(error) => Err(f(error)), + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct LiteralError<'a> { + pub expected: &'static str, + pub found: &'a str, +} + +pub fn literal<'a>(literal: &'static str) -> impl Parser<'a, &'static str, LiteralError<'a>> { + move |input: &'a str| { + if let Some(rest) = input.strip_prefix(literal) { + Ok((literal, rest)) + } else { + Err(LiteralError { + expected: literal, + found: input, + }) + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RegexError<'a> { + pub re: &'static str, + pub found: &'a str, +} + +pub fn regex<'a>(re_str: &'static str) -> impl Parser<'a, Captures<'a>, RegexError<'a>> { + let re = Regex::new(re_str).unwrap(); + move |input: &'a str| match re.captures(input) { + Some(captures) => { + let match_length = captures.get(0).unwrap().end(); + Ok((captures, &input[match_length..])) + } + None => Err(RegexError { + re: re_str, + found: input, + }), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pair() { + let parser = pair(literal("foo"), literal("bar")); + assert_eq!(parser.parse("foobar"), Ok((("foo", "bar"), ""))); + assert_eq!(parser.parse("foobarbaz"), Ok((("foo", "bar"), "baz"))); + assert_eq!( + parser.parse("foo"), + Err(Either::Right(LiteralError { + found: "", + expected: "bar" + })) + ); + assert_eq!( + parser.parse("bar"), + Err(Either::Left(LiteralError { + found: "bar", + expected: "foo" + })) + ); + } + + #[test] + fn test_some() { + let parser = some(literal("foo"), literal("bar")); + assert_eq!(parser.parse("foo"), Ok((Either::Left("foo"), ""))); + assert_eq!(parser.parse("bar"), Ok((Either::Right("bar"), ""))); + assert_eq!( + parser.parse("baz"), + Err(( + LiteralError { + found: "baz", + expected: "foo" + }, + LiteralError { + found: "baz", + expected: "bar" + } + )) + ); + } + + #[test] + fn test_map() { + let parser = map(literal("foo"), |_| 1); + assert_eq!(parser.parse("foo"), Ok((1, ""))); + assert_eq!( + parser.parse("bar"), + Err(LiteralError { + found: "bar", + expected: "foo" + }) + ); + } + + #[test] + fn test_map_err() { + let parser = map_err(literal("foo"), |_| 1); + assert_eq!(parser.parse("foo"), Ok(("foo", ""))); + assert_eq!(parser.parse("bar"), Err(1)); + } +} diff --git a/src/parser/cst.rs b/src/parser/cst.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/parser/cst.rs @@ -0,0 +1 @@ + diff --git a/src/parser/error.rs b/src/parser/error.rs new file mode 100644 index 0000000..d5c29ff --- /dev/null +++ b/src/parser/error.rs @@ -0,0 +1,16 @@ +pub enum ParseError { + LexError(LexError), + SintacticError(SintacticError), +} + +pub enum LexError { + ExpectedCharacter(char), + ExpectedPattern(String), + ExpectedOneOf(Vec), + InvalidIntLiteral, + UnclosedStringLiteral, +} + +pub enum SintacticError {} + +// TODO: Semantic error diff --git a/src/parser/lex.rs b/src/parser/lex.rs new file mode 100644 index 0000000..4b9624a --- /dev/null +++ b/src/parser/lex.rs @@ -0,0 +1,16 @@ +use crate::parser::error::ParseError; +use crate::parser::*; + +pub fn integer_literal(input: &str) -> ParseResult<'_, i64, ParseError> { + todo!() +} + +// DESIGN(cdecompilador): Should this really a &str?? or a Span?? or an allocated String?? +pub fn identifier(input: &str) -> ParseResult<'_, String, ParseError> { + todo!() +} + +// DESIGN(cdecompilador): Should this really a &str?? or a Span?? or an allocated String?? +pub fn string_literal(input: &str) -> ParseResult<'_, String, ParseError> { + todo!() +} From 1e6b4781a83e3001429e22867f2b72193183b8be Mon Sep 17 00:00:00 2001 From: cdecompilador Date: Thu, 25 Apr 2024 18:26:47 +0200 Subject: [PATCH 2/2] parser combinators: removed left & right --- src/parser/combinators.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/parser/combinators.rs b/src/parser/combinators.rs index 46ce46e..bf2166a 100644 --- a/src/parser/combinators.rs +++ b/src/parser/combinators.rs @@ -23,18 +23,6 @@ pub fn pair<'a, Out1, Out2, Err1, Err2>( } } -pub fn left<'a, Out1, Out2, Err1, Err2>( - parser: impl Parser<'a, (Out1, Out2), Either>, -) -> impl Parser<'a, Out1, Either> { - map(parser, |(left, _)| left) -} - -pub fn right<'a, Out1, Out2, Err1, Err2>( - parser: impl Parser<'a, (Out1, Out2), Either>, -) -> impl Parser<'a, Out2, Either> { - map(parser, |(_, right)| right) -} - pub fn some<'a, Out1, Out2, Err1, Err2>( first: impl Parser<'a, Out1, Err1>, second: impl Parser<'a, Out2, Err2>,