Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(parser): the 1st version of a new combinator style parser #16876

Merged
merged 39 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
64132ed
feat(parser): the 1st version of a new parser-combinator style parser
TennyZhuang May 21, 2024
f68d97e
add parse_v2
TennyZhuang May 21, 2024
12bfadf
Update src/sqlparser/src/parser_v2/data_type.rs
TennyZhuang May 21, 2024
b0a0afd
minor fix
TennyZhuang May 22, 2024
8ea355a
minor improvement
TennyZhuang May 22, 2024
332234b
minor fix
TennyZhuang May 22, 2024
2ccd163
add some label
TennyZhuang May 22, 2024
5a3b34d
introduce TokenStreamWrapper for better readability
TennyZhuang May 22, 2024
6c9683a
fix parse_v2
TennyZhuang May 22, 2024
905fa6a
handle whitespace
TennyZhuang May 22, 2024
80208ef
use preceed
TennyZhuang May 22, 2024
9c9472f
remove dbg
TennyZhuang May 22, 2024
2d6692c
fix custom data type
TennyZhuang May 22, 2024
7e8166d
fix array parsing
TennyZhuang May 23, 2024
686217c
add a context
TennyZhuang May 23, 2024
51f9265
fix unused import
TennyZhuang May 23, 2024
89f67e5
fix struct parsing
TennyZhuang May 23, 2024
6b62577
fix float parsing
TennyZhuang May 23, 2024
32d29db
fix precision parsing
TennyZhuang May 23, 2024
c76c314
add many comments
TennyZhuang May 23, 2024
d00ec9f
fix clippy
TennyZhuang May 23, 2024
60f2019
simplify error def
TennyZhuang May 23, 2024
3b366ab
fix precision parsing
TennyZhuang May 23, 2024
26c7499
fix double
TennyZhuang May 23, 2024
b26810d
fix custom type parsing
TennyZhuang May 23, 2024
5cdab19
1..54
TennyZhuang May 23, 2024
79ce4ea
support TEXT
TennyZhuang May 23, 2024
df6137b
fix
TennyZhuang May 23, 2024
4504419
fix ut
TennyZhuang May 23, 2024
0e239cc
refine error message
TennyZhuang May 23, 2024
9ba9e97
fix stateful parsing
TennyZhuang May 23, 2024
3cf20d1
refine error
TennyZhuang May 23, 2024
88562ff
fix ut
TennyZhuang May 23, 2024
8452dc7
fix struct sep parsing
TennyZhuang May 24, 2024
f9d93c7
Update src/sqlparser/src/parser_v2/number.rs
TennyZhuang May 24, 2024
162a663
address comments
TennyZhuang May 24, 2024
cc9dbc3
fix warning
TennyZhuang May 24, 2024
d8b7d60
fix
TennyZhuang May 24, 2024
b7d79e1
revert a behavior
TennyZhuang May 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/sqlparser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ itertools = { workspace = true }
serde = { version = "1.0", features = ["derive"], optional = true }
tracing = "0.1"
tracing-subscriber = "0.3"
winnow = { version = "0.6.8", git = "https://github.com/TennyZhuang/winnow.git", rev = "509121449b451f0faa1b698b4e8cb36dfaa1a4b5" }

[target.'cfg(not(madsim))'.dependencies]
workspace-hack = { path = "../workspace-hack" }
Expand Down
1 change: 1 addition & 0 deletions src/sqlparser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ extern crate alloc;
pub mod ast;
pub mod keywords;
pub mod parser;
pub mod parser_v2;
pub mod tokenizer;

#[doc(hidden)]
Expand Down
24 changes: 24 additions & 0 deletions src/sqlparser/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use alloc::{
vec::Vec,
};
use core::fmt;
use std::fmt::format;

use itertools::Itertools;
use tracing::{debug, instrument};
Expand Down Expand Up @@ -187,6 +188,29 @@ impl Parser {
}
}

/// Run a winnow-style (v2) parser over the remaining tokens of this (v1) parser.
///
/// Bridges the legacy `Parser` to the new combinator parsers: the token slice is
/// wrapped in a [`winnow::Located`] stream, the given parser is run once, and the
/// v1 cursor (`self.index`) is advanced by however many tokens the v2 parser
/// consumed — even on failure, so follow-up error reporting points at the right
/// place. Errors are converted into a [`ParserError`] carrying the token offset.
pub(crate) fn parse_v2<'a, O>(
    &'a mut self,
    mut parse_next: impl winnow::Parser<
        winnow::Located<&'a [TokenWithLocation]>,
        O,
        winnow::error::ContextError,
    >,
) -> Result<O, ParserError> {
    use winnow::stream::Location;

    let mut token_stream = winnow::Located::new(&*self.tokens);
    let result = match parse_next.parse_next(&mut token_stream) {
        Ok(output) => Ok(output),
        Err(e) => Err(ParserError::ParserError(format!(
            "Error parsing SQL at {}: {}",
            token_stream.location(),
            e
        ))),
    };
    // NOTE(review): `location()` here is presumably the number of tokens consumed
    // from the `Located` slice stream — confirm against the pinned winnow revision.
    self.index += token_stream.location();
    result
}

/// Parse a SQL statement and produce an Abstract Syntax Tree (AST)
#[instrument(level = "debug")]
pub fn parse_sql(sql: &str) -> Result<Vec<Statement>, ParserError> {
Expand Down
161 changes: 161 additions & 0 deletions src/sqlparser/src/parser_v2/data_type.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
use core::cell::RefCell;
use std::rc::Rc;

use winnow::combinator::{alt, delimited, dispatch, empty, fail, opt, separated, seq};
use winnow::error::{ContextError, ErrMode};
use winnow::{PResult, Parser, Stateful};

use super::{
identifier_non_reserved, keyword, literal_uint, precision_in_range, token, with_state,
TokenStream,
};
use crate::ast::{DataType, StructField};
use crate::keywords::Keyword;
use crate::tokenizer::Token;

/// Shared state threaded through the data-type parsers to handle nested
/// `STRUCT<...>` closes.
///
/// The tokenizer lexes the two closing angle brackets of a nested struct
/// (e.g. `STRUCT<a STRUCT<b INT>>`) as a single [`Token::ShiftRight`]. Whichever
/// struct parser consumes that token records the "extra" `>` here so the
/// enclosing struct parser can treat it as its own closing bracket.
#[derive(Default, Debug)]
struct DataTypeParsingState {
    /// When we consumed an [`Token::ShiftRight`], we set this to true.
    remaining_close: Rc<RefCell<bool>>,
}

// A token stream paired with the struct-nesting state above.
type StatefulStream<S> = Stateful<S, DataTypeParsingState>;

/// Parse the `<name: type, ...>` part of a `STRUCT` data type.
///
/// The closing bracket needs special care: for nested structs the tokenizer
/// emits `>>` as one [`Token::ShiftRight`], so a close is either (a) a pending
/// `>` left in the shared state by an inner struct that consumed a `>>`,
/// (b) a `>>` token whose second half is recorded for the enclosing struct, or
/// (c) a plain `>` token.
fn struct_data_type<S>(input: &mut StatefulStream<S>) -> PResult<Vec<StructField>>
where
    S: TokenStream,
{
    let remaining_close1 = input.state.remaining_close.clone();
    let remaining_close2 = input.state.remaining_close.clone();

    let consume_close = alt((
        move |_input: &mut StatefulStream<S>| -> PResult<()> {
            let mut remaining = remaining_close1.borrow_mut();
            if *remaining {
                // BUGFIX: the pending `>` from a previous `>>` must be consumed
                // exactly once; without resetting the flag, any later struct
                // close would be spuriously satisfied without eating a token.
                *remaining = false;
                Ok(())
            } else {
                Err(ErrMode::Backtrack(ContextError::new()))
            }
        }
        .void(),
        (
            Token::ShiftRight,
            move |_input: &mut StatefulStream<S>| -> PResult<()> {
                // Record the second `>` of `>>` for the enclosing struct.
                *remaining_close2.borrow_mut() = true;
                Ok(())
            },
        )
            .void(),
        Token::Gt.void(),
    ));

    delimited(
        Token::Lt,
        separated(
            1..,
            seq! {
                StructField {
                    name: identifier_non_reserved,
                    _: Token::Colon,
                    data_type: data_type_stateful,
                }
            },
            Token::Comma,
        ),
        consume_close,
    )
    .parse_next(input)
}

/// Parse a SQL data type from a plain (stateless) token stream.
///
/// This is the public entry point; it wraps the stream in a fresh
/// [`DataTypeParsingState`] and delegates to the stateful parser.
pub fn data_type<S>(input: &mut S) -> PResult<DataType>
where
    S: TokenStream,
{
    let mut stateful = with_state::<S, DataTypeParsingState, _, _>(data_type_stateful);
    stateful.parse_next(input)
}

/// Parse a data type, including an optional trailing `[]` array suffix.
fn data_type_stateful<S>(input: &mut StatefulStream<S>) -> PResult<DataType>
where
    S: TokenStream,
{
    let (element_type, brackets) = (
        data_type_stateful_inner,
        opt((Token::LBracket, Token::RBracket)),
    )
        .parse_next(input)?;
    // `T[]` denotes an array of `T`; a bare type is returned unchanged.
    Ok(match brackets {
        Some(_) => DataType::Array(Box::new(element_type)),
        None => element_type,
    })
}

/// Parse a single (non-array) data type keyword and its optional modifiers.
fn data_type_stateful_inner<S>(input: &mut StatefulStream<S>) -> PResult<DataType>
where
    S: TokenStream,
{
    // Optional `WITH TIME ZONE` / `WITHOUT TIME ZONE`; absent means "without".
    let with_time_zone = || {
        opt(alt((
            (Keyword::WITH, Keyword::TIME, Keyword::ZONE).value(true),
            (Keyword::WITHOUT, Keyword::TIME, Keyword::ZONE).value(false),
        )))
        .map(|x| x.unwrap_or(false))
    };

    // Optional `(precision [, scale])` for NUMERIC/DECIMAL.
    let precision_and_scale = || {
        opt(delimited(
            Token::LParen,
            (
                literal_uint,
                opt((Token::Comma, literal_uint).map(|(_, x)| x)),
            ),
            Token::RParen,
        ))
        .map(|p| match p {
            Some((x, y)) => (Some(x), y),
            None => (None, None),
        })
    };

    let keywords = dispatch! {keyword;
        Keyword::BOOLEAN | Keyword::BOOL => empty.value(DataType::Boolean),
        // BUGFIX: PostgreSQL accepts FLOAT(p) for 1 <= p <= 53, so the
        // half-open range must be 1..54 (the previous 1..53 rejected FLOAT(53)).
        Keyword::FLOAT => opt(precision_in_range(1..54)).map(DataType::Float),
        Keyword::REAL => empty.value(DataType::Real),
        Keyword::DOUBLE => Keyword::PRECISION.value(DataType::Double),
        Keyword::SMALLINT => empty.value(DataType::SmallInt),
        Keyword::INT | Keyword::INTEGER => empty.value(DataType::Int),
        Keyword::BIGINT => empty.value(DataType::BigInt),
        Keyword::STRING | Keyword::VARCHAR => empty.value(DataType::Varchar),
        Keyword::CHAR | Keyword::CHARACTER => dispatch! {keyword;
            Keyword::VARYING => empty.value(DataType::Varchar),
            _ => opt(precision_in_range(..)).map(DataType::Char),
        },
        Keyword::UUID => empty.value(DataType::Uuid),
        Keyword::DATE => empty.value(DataType::Date),
        Keyword::TIMESTAMP => with_time_zone().map(DataType::Timestamp),
        Keyword::TIME => with_time_zone().map(DataType::Time),
        // TODO: Support complex interval type parsing.
        Keyword::INTERVAL => empty.value(DataType::Interval),
        Keyword::REGCLASS => empty.value(DataType::Regclass),
        Keyword::REGPROC => empty.value(DataType::Regproc),
        Keyword::STRUCT => struct_data_type.map(DataType::Struct),
        Keyword::BYTEA => empty.value(DataType::Bytea),
        Keyword::NUMERIC | Keyword::DECIMAL | Keyword::DEC => precision_and_scale().map(|(precision, scale)| {
            DataType::Decimal(precision, scale)
        }),
        _ => fail::<_, DataType, _>,
    };

    alt((
        keywords,
        // JSONB is not a keyword, but a special data type.
        token
            .verify(|t| {
                matches!(&t.token, Token::Word(w) if w.value.eq_ignore_ascii_case("jsonb"))
            })
            .value(DataType::Jsonb),
    ))
    .parse_next(input)
}
142 changes: 142 additions & 0 deletions src/sqlparser/src/parser_v2/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use winnow::error::ContextError;
use winnow::stream::{Location, Stream, StreamIsPartial};
use winnow::token::any;
use winnow::{PResult, Parser, Stateful};

use crate::ast::Ident;
use crate::keywords::{self, Keyword};
use crate::tokenizer::{Token, TokenWithLocation};

mod data_type;
mod number;

pub(crate) use data_type::*;
pub(crate) use number::*;

/// The stream of tokens the v2 combinator parsers operate on.
///
/// This is an alias-style trait bundling the winnow stream bounds needed
/// everywhere: a [`Stream`] of [`TokenWithLocation`] that reports its
/// [`Location`] and can be `Default`-constructed (used by `with_state` to
/// temporarily take ownership of the stream).
trait TokenStream: Stream<Token = TokenWithLocation> + StreamIsPartial + Location + Default {}

// Blanket impl: any stream meeting the bounds is a `TokenStream`.
impl<S> TokenStream for S where
    S: Stream<Token = TokenWithLocation> + StreamIsPartial + Location + Default
{
}

fn token<S>(input: &mut S) -> PResult<TokenWithLocation>
where
S: TokenStream,
{
any(input)
}

/// Consume the next token if it is a word recognized as a keyword, yielding
/// that keyword; backtrack otherwise.
fn keyword<S>(input: &mut S) -> PResult<Keyword>
where
    S: TokenStream,
{
    token
        .verify_map(|t| {
            if let Token::Word(w) = &t.token {
                (w.keyword != Keyword::NoKeyword).then_some(w.keyword)
            } else {
                None
            }
        })
        .parse_next(input)
}

/// Let a [`Token`] value be used directly as a parser that matches exactly
/// that token and yields the consumed [`TokenWithLocation`].
impl<I> Parser<I, TokenWithLocation, ContextError> for Token
where
    I: TokenStream,
{
    fn parse_next(&mut self, input: &mut I) -> PResult<TokenWithLocation, ContextError> {
        let matches_self = move |t: &TokenWithLocation| t.token == *self;
        token.verify(matches_self).parse_next(input)
    }
}

/// Let a [`Keyword`] value be used directly as a parser that matches exactly
/// that keyword and yields it.
impl<I> Parser<I, Keyword, ContextError> for Keyword
where
    I: TokenStream,
{
    fn parse_next(&mut self, input: &mut I) -> PResult<Keyword, ContextError> {
        token
            .verify_map(move |t| {
                if let Token::Word(w) = &t.token {
                    (w.keyword == *self).then_some(w.keyword)
                } else {
                    None
                }
            })
            .parse_next(input)
    }
}

/// Consume the next token as an identifier, rejecting words reserved for
/// column or table names.
fn identifier_non_reserved<S>(input: &mut S) -> PResult<Ident>
where
    S: TokenStream,
{
    // FIXME: Reporting error correctly.
    token
        .verify_map(|t| {
            let Token::Word(w) = &t.token else { return None };
            if keywords::RESERVED_FOR_COLUMN_OR_TABLE_NAME.contains(&w.keyword) {
                None
            } else {
                w.to_ident().ok()
            }
        })
        .parse_next(input)
}

/// Adapt a parser over a [`Stateful`] stream into one over a plain stream.
///
/// A fresh `State::default()` is created per invocation; the input stream is
/// temporarily moved into the `Stateful` wrapper (via `mem::take`, hence the
/// `Default` bound on `S`) and moved back afterwards so consumption is
/// reflected in the caller's stream.
fn with_state<S, State, O, ParseNext>(mut parse_next: ParseNext) -> impl Parser<S, O, ContextError>
where
    S: TokenStream,
    State: Default,
    ParseNext: Parser<Stateful<S, State>, O, ContextError>,
{
    move |input: &mut S| -> PResult<O> {
        let mut stateful = Stateful {
            input: std::mem::take(input),
            state: State::default(),
        };
        let output = parse_next.parse_next(&mut stateful);
        // Hand the (possibly advanced) stream back to the caller.
        *input = stateful.input;
        output
    }
}

#[cfg(test)]
mod tests {
    use winnow::Located;

    use super::*;
    use crate::tokenizer::Tokenizer;

    /// A `Token` value should be usable directly as a parser.
    #[test]
    fn test_basic() {
        let tokens = Tokenizer::new("SELECT 1").tokenize_with_location().unwrap();
        let mut stream = Located::new(&*tokens);
        Token::make_keyword("SELECT")
            .parse_next(&mut stream)
            .unwrap();
    }

    /// `with_state` should thread a mutable state through the inner parser.
    #[test]
    fn test_stateful() {
        let tokens = Tokenizer::new("SELECT 1").tokenize_with_location().unwrap();
        let mut stream = Located::new(&*tokens);
        with_state(|input: &mut Stateful<_, usize>| -> PResult<()> {
            input.state += 1;
            Token::make_keyword("SELECT").void().parse_next(input)
        })
        .parse_next(&mut stream)
        .unwrap();
    }
}
Loading
Loading