-
Notifications
You must be signed in to change notification settings - Fork 0
Lite PEG
duangsuse edited this page May 9, 2018
·
6 revisions
Work in progress, 也有可能不会出,因为 duangsuse 现在还太菜,不会用解析器生成器, 垃圾 @duangsuse
- 为什么不会呢
duangsuse 还无法理解为什么 PEG 可以通过 `add -> multiply '+' add` 这样看起来奇怪的语法规则去解决运算符优先级的问题
duangsuse 还无法理解解析中的交换律
duangsuse 并不了解正则表达式,不过这个可以学,但是内容量比较大
- PEG 是什么?
Parsing Expression Grammar(解析表达式文法),一种描述解析器语法规则的语法
- 看起来怎么样?
// Simple Arithmetics Grammar
// ==========================
//
// Evaluates integer arithmetic such as "2 * (3 + 4)" while parsing.
// Precedence comes from rule layering: Expression (+, -) is built from
// Term (*, /), which is built from Factor (parentheses or an integer).

// Additive level: fold every (operator, Term) pair into the running value,
// left to right.
Expression
  = first:Term rest:(_ ("+" / "-") _ Term)* {
      // Each entry of `rest` is [whitespace, operator, whitespace, value].
      return rest.reduce(function(acc, step) {
        var op = step[1], rhs = step[3];
        if (op === "+") { return acc + rhs; }
        if (op === "-") { return acc - rhs; }
      }, first);
    }

// Multiplicative level: same folding scheme, one layer tighter.
Term
  = first:Factor rest:(_ ("*" / "/") _ Factor)* {
      return rest.reduce(function(acc, step) {
        var op = step[1], rhs = step[3];
        if (op === "*") { return acc * rhs; }
        if (op === "/") { return acc / rhs; }
      }, first);
    }

// A parenthesised sub-expression, or a plain integer.
Factor
  = "(" _ inner:Expression _ ")" { return inner; }
  / Integer

// Optional leading whitespace followed by decimal digits; the matched text
// is converted with base-10 parseInt.
Integer "integer"
  = _ [0-9]+ { return parseInt(text(), 10); }

// Any run (possibly empty) of spaces, tabs, and line breaks.
_ "whitespace"
  = [ \t\n\r]*
能吃吗
不能吃呦 😺
- 所以到底有没有
有的
// Complete Lite Desugared Syntax (Ohm PEG)
// Lite parser by duangsuse, no rights reserved (lexical rules see https://ohmlang.github.io/editor)
Lite {
// Lexical layer, adapted from the JavaScript (ECMAScript 5) lexical grammar:
// §A.1 Lexical Grammar -- https://es5.github.io/#A.1

// First rule of the grammar: a program is a sequence of statements.
Program
  = CompStmt

sourceCharacter
  = any

// Replaces Ohm's built-in definition of space. Line terminators are not
// listed here, only blanks and comments are.
space
  := whitespace
   | comment

whitespace
  = "\t"
  | "\x0B"    -- verticalTab
  | "\x0C"    -- formFeed
  | " "
  | "\u00A0"  -- noBreakSpace
  | "\uFEFF"  -- byteOrderMark
  | unicodeSpaceSeparator

lineTerminator
  = "\n"
  | "\r"
  | "\u2028"
  | "\u2029"

// A lone "\r" counts only when it is not the first half of "\r\n".
lineTerminatorSequence
  = "\n"
  | "\r" ~"\n"
  | "\u2028"
  | "\u2029"
  | "\r\n"

comment
  = multiLineComment
  | singleLineComment

// Block comments run from ">####<" to the first "<####>".
multiLineComment
  = ">####<" (~"<####>" sourceCharacter)* "<####>"

// Line comments start at "#" and stop before the next line terminator.
singleLineComment
  = "#" (~lineTerminator sourceCharacter)*
// An identifier is an identifierName that is not a reserved word,
// optionally prefixed with "@".
identifier (an identifier)
  = "@"? ~reservedWord identifierName

identifierName
  = identifierStart identifierPart*

identifierStart
  = letter
  | "$"
  | "_"
  | "\\" unicodeEscapeSequence  -- escaped

identifierPart
  = identifierStart
  | unicodeCombiningMark
  | unicodeDigit
  | unicodeConnectorPunctuation
  | "\u200C"
  | "\u200D"

// Extends the inherited `letter` rule with the ranges listed below.
letter
  += unicodeCategoryNl

unicodeCategoryNl
  = "\u2160".."\u2182"
  | "\u3007"
  | "\u3021".."\u3029"
// Code-point tables used by the identifier and whitespace rules.
// Each rule is a flat choice of single characters or inclusive ranges.

// Digit ranges accepted inside identifiers (via identifierPart).
unicodeDigit (a digit)
= "\u0030".."\u0039" | "\u0660".."\u0669" | "\u06F0".."\u06F9" | "\u0966".."\u096F" | "\u09E6".."\u09EF" | "\u0A66".."\u0A6F" | "\u0AE6".."\u0AEF" | "\u0B66".."\u0B6F" | "\u0BE7".."\u0BEF" | "\u0C66".."\u0C6F" | "\u0CE6".."\u0CEF" | "\u0D66".."\u0D6F" | "\u0E50".."\u0E59" | "\u0ED0".."\u0ED9" | "\u0F20".."\u0F29" | "\uFF10".."\uFF19"
// Combining-mark ranges accepted inside identifiers (via identifierPart).
unicodeCombiningMark (a Unicode combining mark)
= "\u0300".."\u0345" | "\u0360".."\u0361" | "\u0483".."\u0486" | "\u0591".."\u05A1" | "\u05A3".."\u05B9" | "\u05BB".."\u05BD" | "\u05BF".."\u05BF" | "\u05C1".."\u05C2" | "\u05C4".."\u05C4" | "\u064B".."\u0652" | "\u0670".."\u0670" | "\u06D6".."\u06DC" | "\u06DF".."\u06E4" | "\u06E7".."\u06E8" | "\u06EA".."\u06ED" | "\u0901".."\u0902" | "\u093C".."\u093C" | "\u0941".."\u0948" | "\u094D".."\u094D" | "\u0951".."\u0954" | "\u0962".."\u0963" | "\u0981".."\u0981" | "\u09BC".."\u09BC" | "\u09C1".."\u09C4" | "\u09CD".."\u09CD" | "\u09E2".."\u09E3" | "\u0A02".."\u0A02" | "\u0A3C".."\u0A3C" | "\u0A41".."\u0A42" | "\u0A47".."\u0A48" | "\u0A4B".."\u0A4D" | "\u0A70".."\u0A71" | "\u0A81".."\u0A82" | "\u0ABC".."\u0ABC" | "\u0AC1".."\u0AC5" | "\u0AC7".."\u0AC8" | "\u0ACD".."\u0ACD" | "\u0B01".."\u0B01" | "\u0B3C".."\u0B3C" | "\u0B3F".."\u0B3F" | "\u0B41".."\u0B43" | "\u0B4D".."\u0B4D" | "\u0B56".."\u0B56" | "\u0B82".."\u0B82" | "\u0BC0".."\u0BC0" | "\u0BCD".."\u0BCD" | "\u0C3E".."\u0C40" | "\u0C46".."\u0C48" | "\u0C4A".."\u0C4D" | "\u0C55".."\u0C56" | "\u0CBF".."\u0CBF" | "\u0CC6".."\u0CC6" | "\u0CCC".."\u0CCD" | "\u0D41".."\u0D43" | "\u0D4D".."\u0D4D" | "\u0E31".."\u0E31" | "\u0E34".."\u0E3A" | "\u0E47".."\u0E4E" | "\u0EB1".."\u0EB1" | "\u0EB4".."\u0EB9" | "\u0EBB".."\u0EBC" | "\u0EC8".."\u0ECD" | "\u0F18".."\u0F19" | "\u0F35".."\u0F35" | "\u0F37".."\u0F37" | "\u0F39".."\u0F39" | "\u0F71".."\u0F7E" | "\u0F80".."\u0F84" | "\u0F86".."\u0F87" | "\u0F90".."\u0F95" | "\u0F97".."\u0F97" | "\u0F99".."\u0FAD" | "\u0FB1".."\u0FB7" | "\u0FB9".."\u0FB9" | "\u20D0".."\u20DC" | "\u20E1".."\u20E1" | "\u302A".."\u302F" | "\u3099".."\u309A" | "\uFB1E".."\uFB1E" | "\uFE20".."\uFE23"
// Connector punctuation accepted inside identifiers; includes "_" (\u005F).
unicodeConnectorPunctuation = "\u005F" | "\u203F".."\u2040" | "\u30FB" | "\uFE33".."\uFE34" | "\uFE4D".."\uFE4F" | "\uFF3F" | "\uFF65"
// Extra space characters; referenced as the last alternative of `whitespace`.
unicodeSpaceSeparator = "\u2000".."\u200B" | "\u3000"
// Words that `identifier` refuses to match.
reservedWord
  = keyword
  | nullLiteral
  | booleanLiteral

// Note: a keyword that is the complete prefix of another keyword should be
// listed before the longer one (e.g. 'in' ahead of 'import').
keyword
  = break
  | do
  | scope
  | in
  | to
  | else
  | elif
  | if
  | as
  | next
  | return
  | endKeyword
  | or
  | for
  | and
  | while
  | require
  | def
  | import

/*
  Note: Punctuator and DivPunctuator (see https://es5.github.io/x7.html#x7.7)
  are not currently used by this grammar.
*/
// Literal values.
literal
  = nullLiteral
  | booleanLiteral
  | numericLiteral
  | stringLiteral

// The ~identifierPart guard stops "nil", "true" and "false" from matching
// the front of a longer name.
nullLiteral
  = "nil" ~identifierPart

booleanLiteral
  = ("true" | "false") ~identifierPart

// For semantics on how decimal literals are constructed, see ES5 section 7.8.3.
// hexIntegerLiteral is intentionally tried before decimalLiteral (reversed
// w.r.t. the spec): with decimalLiteral first, "0x..." would parse as the
// decimal literal "0" followed by "x...".
numericLiteral
  = octalIntegerLiteral
  | hexIntegerLiteral
  | decimalLiteral

decimalLiteral
  = decimalIntegerLiteral "." decimalDigit* exponentPart  -- bothParts
  | "." decimalDigit+ exponentPart                        -- decimalsOnly
  | decimalIntegerLiteral exponentPart                    -- integerOnly

decimalIntegerLiteral
  = nonZeroDigit decimalDigit*  -- nonZero
  | "0"                         -- zero

decimalDigit
  = "0".."9"

nonZeroDigit
  = "1".."9"
// Optional exponent of a decimal literal. The `absent` case matches the
// empty string, so callers can always apply exponentPart.
exponentPart
  = exponentIndicator signedInteger  -- present
  |                                  -- absent

exponentIndicator
  = "e"
  | "E"

// Exponent digits with an optional sign. At least one digit is required in
// every case (ES5 SignedInteger); a sign with no digits, as in "1e+", is
// therefore not an exponent.
signedInteger
  = "+" decimalDigit+  -- positive
  | "-" decimalDigit+  -- negative
  | decimalDigit+      -- noSign
// Hexadecimal integers: "0x" or "0X" followed by at least one hex digit.
// hexDigit comes from Ohm's built-in rules
// (otherwise: hexDigit = "0".."9" | "a".."f" | "A".."F").
hexIntegerLiteral
  = "0x" hexDigit+
  | "0X" hexDigit+

// Octal integers: a leading "0" followed by at least one digit 0-7.
octalIntegerLiteral
  = "0" octalDigit+

octalDigit
  = "0".."7"
// String literals, double- or single-quoted.
// For semantics on how string literals are constructed, see ES5 section 7.8.4.
stringLiteral
  = "\"" doubleStringCharacter* "\""
  | "'" singleStringCharacter* "'"

// One character of a double-quoted string: anything except the closing
// quote, a backslash, or a line terminator; or an escape; or a continuation.
doubleStringCharacter
  = ~("\"" | "\\" | lineTerminator) sourceCharacter  -- nonEscaped
  | "\\" escapeSequence                               -- escaped
  | lineContinuation                                  -- lineContinuation

// Same as above, for single-quoted strings.
singleStringCharacter
  = ~("'" | "\\" | lineTerminator) sourceCharacter  -- nonEscaped
  | "\\" escapeSequence                              -- escaped
  | lineContinuation                                 -- lineContinuation

// A backslash directly followed by a line break.
lineContinuation
  = "\\" lineTerminatorSequence

// characterEscapeSequence must come last.
escapeSequence
  = unicodeEscapeSequence
  | hexEscapeSequence
  | octalEscapeSequence
  | characterEscapeSequence

characterEscapeSequence
  = singleEscapeCharacter
  | nonEscapeCharacter

singleEscapeCharacter
  = "'"
  | "\""
  | "\\"
  | "b"
  | "f"
  | "n"
  | "r"
  | "t"
  | "v"

nonEscapeCharacter
  = ~(escapeCharacter | lineTerminator) sourceCharacter

escapeCharacter
  = singleEscapeCharacter
  | decimalDigit
  | "x"
  | "u"

octalEscapeSequence
  = zeroToThree octalDigit octalDigit     -- whole
  | fourToSeven octalDigit                -- eightTimesfourToSeven
  | zeroToThree octalDigit ~decimalDigit  -- eightTimesZeroToThree
  | octalDigit ~decimalDigit              -- octal

hexEscapeSequence
  = "x" hexDigit hexDigit

unicodeEscapeSequence
  = "u" hexDigit hexDigit hexDigit hexDigit

zeroToThree
  = "0".."3"

fourToSeven
  = "4".."7"
// === Implementation-level rules (not part of the spec) ===
// Statement terminator. It matches an explicit ";", the end of the input,
// a line terminator, or a comment. The rule is modelled on automatic
// semicolon insertion; see https://es5.github.io/#x7.9 for background.
// NOTE: this is a lexical rule, so it does no space skipping of its own.
sc
  = ";"
  | end
  | lineTerminator
  | comment
// Convenience rules for parsing keyword tokens. Every rule rejects a
// following identifierPart, so a keyword never matches the front of a
// longer name. The rule for "end" is called endKeyword because `sc`
// already applies a rule named `end` for the end of the input.
break      = "break"   ~identifierPart
do         = "do"      ~identifierPart
scope      = "scope"   ~identifierPart
in         = "in"      ~identifierPart
else       = "else"    ~identifierPart
elif       = "elif"    ~identifierPart
if         = "if"      ~identifierPart
as         = "as"      ~identifierPart
next       = "next"    ~identifierPart
return     = "return"  ~identifierPart
endKeyword = "end"     ~identifierPart
or         = "or"      ~identifierPart
for        = "for"     ~identifierPart
and        = "and"     ~identifierPart
while      = "while"   ~identifierPart
require    = "require" ~identifierPart
def        = "def"     ~identifierPart
import     = "import"  ~identifierPart
to         = "to"      ~identifierPart
// end of modified javascript lexical rules
// start of expressions
//
// Lite operator precedence, loosest binding first:
//   | or in
//   & and
//   < > <= >= != == !== ===
//   <<
//   to
//   + -
//   * / %
//   ** :: as
//   Unary- ! ++ -- .
//
// Each binary level below is written with left recursion and falls through
// to the next tighter level in its last alternative.
Exp
  = OrExp

OrExp
  = OrExp "|" AndExp  -- or
  | OrExp or AndExp   -- orKeyword
  | OrExp in AndExp   -- in
  | AndExp

AndExp
  = AndExp "&" RelationExp  -- and
  | AndExp and RelationExp  -- andKeyword
  | RelationExp

RelationExp
  = RelationExp "<" ShiftExp    -- lessThan
  | RelationExp ">" ShiftExp    -- greaterThan
  | RelationExp "<=" ShiftExp   -- lessEqual
  | RelationExp ">=" ShiftExp   -- greaterEqual
  | RelationExp "!=" ShiftExp   -- notEqual
  | RelationExp "==" ShiftExp   -- equal
  | RelationExp "!==" ShiftExp  -- notFullEqual
  | RelationExp "===" ShiftExp  -- fullEqual
  | ShiftExp

ShiftExp
  = ShiftExp "<<" RangeExp  -- shift
  | RangeExp

RangeExp
  = RangeExp to AddExp  -- range
  | AddExp

AddExp
  = AddExp "+" MulExp  -- plus
  | AddExp "-" MulExp  -- minus
  | MulExp

MulExp
  = MulExp "*" ExpExp  -- times
  | MulExp "/" ExpExp  -- divide
  | MulExp "%" ExpExp  -- remainder
  | ExpExp

// NOTE(review): `power` recurses on ExpExp on both sides of "**", so its
// associativity depends on how the parser resolves left recursion -- confirm
// the intended grouping of a ** b ** c.
ExpExp
  = ExpExp "**" ExpExp       -- power
  | ExpExp "::" identifier   -- square
  | ExpExp as identifier     -- as
  | PriExp
// Primary expressions: the tightest-binding level. Alternatives are tried
// in the order listed.
PriExp
  = "(" Exp ")"      -- paren
  | "-" PriExp       -- neg
  | "!" PriExp       -- not
  | identifier "++"  -- inc
  | identifier "--"  -- dec
  | literal          -- literal
  | Call             -- callExp
  | LiteExpr         -- liteExp

// Composite literal forms: lists, tables, and the two block syntaxes.
LiteExpr
  = List
  | Table
  | BraceBlock
  | DoBlock
// Separator between list items, arguments, and parameter names.
// Divider is a syntactic rule, so spaces are skipped implicitly before its
// terminal; a separate " " alternative could never match and ", " accepted
// nothing that "," does not. Every use site writes `Divider?`, so items
// separated only by blanks are still accepted.
Divider
  = ","
// List literals: "[" items "]", or the word-list form that opens with ":["
// and takes every character up to the first "]".
List
  = "[" ExpList "]"                   -- simpleList
  | ":[" (~"]" sourceCharacter)* "]"  -- wordList

// Zero or more expressions, each optionally preceded by a Divider.
ExpList
  = (Divider? Exp)*

// Table literal: "{" key ":" value entries "}".
Table
  = "{" KvList "}"

// Each entry may be followed by "," or a newline.
KvList
  = (identifier ":" Exp ("," | "\n")?)*
// Call and member-access chains, grown left-recursively from an identifier.
//
// justIdentifier rejects an identifier that is followed by a single "=",
// which leaves `name = value` to the Assign rule. The guard is
// ~("=" ~"=") rather than ~"=" so that an identifier followed by "==" or
// "===" is still an expression; with the plain ~"=" guard, `a == b` could
// not be parsed when the left operand was a bare identifier.
//
// NOTE(review): callEasy takes a following expression as an argument, and
// PriExp accepts a leading "-" or "!", so `f - 1` appears to parse as the
// call `f (-1)` rather than a subtraction -- confirm this is intended.
Call
  = Call "(" ExpList ")"    -- call
  | Call "." identifier     -- callIndex
  | Call "[" Exp "]"        -- justIndex
  | Call ExpList            -- callEasy
  | identifier ~("=" ~"=")  -- justIdentifier
// Brace block: "{" optional |params| then simple statements, each
// optionally introduced by ":", then "}".
BraceBlock
  = "{" NameListB? (":"? SimpleStatement)* "}"

// Parameter names for `def`; both parentheses are optional.
NameList
  = "("? (Divider? identifier)* ")"?

// Parameter names for blocks, wrapped in "|" ... "|".
NameListB
  = "|" (Divider? identifier)* "|"

// do-block: `do`, optional |params|, then a Block.
DoBlock
  = do NameListB? Block

// end Exp part
// Statements that fit on one line.
//
// The assignment forms are tried before a bare expression. IndexEq and Arrow
// both begin with an Exp, and ordered choice commits to the first alternative
// that succeeds; with Exp listed first, the expression prefix was accepted
// as a complete statement and the trailing "[...] = ..." or "-> ..." was
// left unparsed. Case names are unchanged.
SimpleStatement
  = Assign   -- assignment
  | IndexEq  -- indexLet
  | Arrow    -- arrowLet
  | Exp      -- expressionStatement
  | Break    -- break
  | Next     -- continue
  | Import   -- import
  | Require  -- require
  | Return   -- return
// Loop control statements.
Break
  = break

Next
  = next

// `import` / `require` followed by everything up to the next line terminator.
Import
  = import (~lineTerminator sourceCharacter)*

Require
  = require (~lineTerminator sourceCharacter)*

// `return` with an optional value.
Return
  = return Exp?

// name = value
Assign
  = identifier "=" Exp

// target[index] = value
// NOTE(review): when the target is a Call chain, Call's justIndex case can
// consume the "[index]" part as well, leaving no "[" for this rule -- confirm
// which targets are meant to be supported.
IndexEq
  = Exp "[" Exp "]" "=" Exp

// source -> name value
Arrow
  = Exp "->" identifier Exp
// Any statement: a simple one, a compound construct, or a bare newline.
Statement
  = SimpleStatement  -- simpleStatement
  | Def              -- defineMethod
  | For              -- forLoop
  | While            -- whileLoop
  | Scope            -- scope
  | If               -- controlFlow
  | "\n"             -- nop

// Method definitions: without parameters and with a block body, without
// parameters and with a single expression body, or with a parameter list.
Def
  = def identifier sc Block           -- defEasy
  | def identifier sc Exp sc          -- defExpr
  | def identifier NameList sc Block  -- def

For
  = for identifier in Exp sc Block

While
  = while Exp sc Block

// `scope` with an optional name.
Scope
  = scope identifier? sc Block

// Conditionals: plain if ... end, if ... else ... end, and the general form
// with any number of elif branches and an optional else.
If
  = if Exp sc Block                -- simpleEnd
  | if Exp sc CompStmt else Block  -- ifElse
  | if Exp sc CompStmt
    (elif Exp sc CompStmt)*
    (else CompStmt)?
    endKeyword                     -- ifElif

// Statements closed by the `end` keyword.
Block
  = CompStmt endKeyword

// Zero or more statements, each optionally followed by a terminator.
CompStmt
  = (Statement sc?)*
}