Skip to content

Commit

Permalink
support range expression
Browse files Browse the repository at this point in the history
  • Loading branch information
goropikari committed Aug 28, 2022
1 parent 2f793fc commit 6d2e42a
Show file tree
Hide file tree
Showing 7 changed files with 1,549 additions and 97 deletions.
2 changes: 1 addition & 1 deletion automata/dfa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ func TestDot(t *testing.T) {
// generate dot file
// go test ./automata/ -run TestDot

s, _ := lexerNFA([]string{"a", "abb", "a*bb*"}).ToDFA().LexerMinimize().RemoveBH().ToDot()
s, _ := lexerNFA([]string{"a", "abb", "a*bb*", "e[^a-zA-Z0-9]*h", "f[a-d]e"}).ToDFA().LexerMinimize().RemoveBH().ToDot()
err := os.WriteFile("ex.dot", []byte(s), 0666)
if err != nil {
log.Fatal(err)
Expand Down
21 changes: 21 additions & 0 deletions compiler/regexp/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,13 @@ const (
SymbolTokenType TokenType = iota + 1
DotTokenType
StarTokenType
MinusTokenType
LParenTokenType
RParenTokenType
LSqBracketTokenType
RSqBracketTokenType
BarTokenType
NegationTokenType
)

type Token struct {
Expand Down Expand Up @@ -99,10 +103,27 @@ func (lex *Lexer) Scan() []Token {
typ = SymbolTokenType
case '*':
typ = StarTokenType
case '-':
typ = MinusTokenType
case '(':
typ = LParenTokenType
case ')':
typ = RParenTokenType
case '[':
lex.tokens = append(lex.tokens, NewToken(LSqBracketTokenType, ru))
ru2, err := lex.read()
if errors.Is(err, io.EOF) {
panic(ErrInvalidRegex)
}
ru = ru2
switch ru {
case '^':
typ = NegationTokenType
default:
typ = SymbolTokenType
}
case ']':
typ = RSqBracketTokenType
case '|':
typ = BarTokenType
case '.':
Expand Down
17 changes: 16 additions & 1 deletion compiler/regexp/lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ func TestLexer_Scan(t *testing.T) {
}{
{
name: "lexer test",
regex: "a(b|c*)deあいう\t\n",
regex: "a(b|c*)deあいう\t\n[a-z][^A-Z]\\+\\-\\*/",
expected: []regexp.Token{
regexp.NewToken(regexp.SymbolTokenType, 'a'),
regexp.NewToken(regexp.LParenTokenType, '('),
Expand All @@ -31,6 +31,21 @@ func TestLexer_Scan(t *testing.T) {
regexp.NewToken(regexp.SymbolTokenType, 'う'),
regexp.NewToken(regexp.SymbolTokenType, '\t'),
regexp.NewToken(regexp.SymbolTokenType, '\n'),
regexp.NewToken(regexp.LSqBracketTokenType, '['),
regexp.NewToken(regexp.SymbolTokenType, 'a'),
regexp.NewToken(regexp.MinusTokenType, '-'),
regexp.NewToken(regexp.SymbolTokenType, 'z'),
regexp.NewToken(regexp.RSqBracketTokenType, ']'),
regexp.NewToken(regexp.LSqBracketTokenType, '['),
regexp.NewToken(regexp.NegationTokenType, '^'),
regexp.NewToken(regexp.SymbolTokenType, 'A'),
regexp.NewToken(regexp.MinusTokenType, '-'),
regexp.NewToken(regexp.SymbolTokenType, 'Z'),
regexp.NewToken(regexp.RSqBracketTokenType, ']'),
regexp.NewToken(regexp.SymbolTokenType, '+'),
regexp.NewToken(regexp.SymbolTokenType, '-'),
regexp.NewToken(regexp.SymbolTokenType, '*'),
regexp.NewToken(regexp.SymbolTokenType, '/'),
},
},
}
Expand Down
120 changes: 101 additions & 19 deletions compiler/regexp/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ package regexp
import (
"errors"
"io"

"github.com/goropikari/golex/automata"
"github.com/goropikari/golex/collection"
)

var (
Expand Down Expand Up @@ -83,6 +86,75 @@ func (p *Parser) sum() (RegexExpr, error) {
return lhs, nil
}

// set parses the body of a bracket expression such as [a-z0-9] or [^abc]
// and expands it into a sum (alternation) of single-symbol expressions.
//
// primary calls set after seeing the '[' token. set consumes tokens up to,
// but not including, the closing ']' token; the caller reads and checks that
// token itself.
//
// Inside the brackets:
//   - a NegationTokenType token marks the set as negated; the result then
//     matches every rune of automata.SupportedChars that is not listed,
//   - a MinusTokenType token between two symbols denotes an inclusive range,
//   - every other token is taken literally via its rune. This includes an
//     escaped '-' which the lexer emits as a plain symbol token, so it is
//     never treated as a range operator.
//
// It returns ErrParse when a range has no left or right endpoint, when a
// range is reversed (e.g. [z-a]), or when the resulting set is empty. Errors
// from the underlying token stream are returned unchanged.
func (p *Parser) set() (RegexExpr, error) {
	neg := false
	runes := make([]rune, 0)
	// pendingRange is true while an unescaped '-' waits for its right
	// endpoint. Tracking the token type rather than the previous rune keeps
	// an escaped literal '-' from being mistaken for a range operator.
	pendingRange := false

scan:
	for {
		tok, err := p.peek()
		if err != nil {
			return nil, err
		}

		switch tok.GetType() {
		case RSqBracketTokenType:
			// A dangling '-' right before ']' has no right endpoint.
			if pendingRange {
				return nil, ErrParse
			}
			// Leave ']' unread; the caller consumes it.
			break scan
		case NegationTokenType:
			neg = true
			pendingRange = false
		case MinusTokenType:
			// A range needs a left endpoint that was already collected;
			// without this check the lookup below would index runes[-1].
			if len(runes) == 0 {
				return nil, ErrParse
			}
			pendingRange = true
		default:
			ru := tok.GetRune()
			if pendingRange {
				from := runes[len(runes)-1]
				if from > ru {
					return nil, ErrParse
				}
				// Both endpoints are appended separately, so only the
				// runes strictly between them are filled in here.
				for t := from + 1; t < ru; t++ {
					runes = append(runes, t)
				}
				pendingRange = false
			}
			runes = append(runes, ru)
		}

		// Consume the token that was just inspected.
		if _, err := p.read(); err != nil {
			return nil, err
		}
	}

	var expr RegexExpr
	if !neg {
		// An empty positive set matches nothing and cannot be expressed.
		if len(runes) == 0 {
			return nil, ErrParse
		}
		expr = NewSymbolExpr(runes[0])
		for _, ru := range runes[1:] {
			expr = NewSumExpr(expr, NewSymbolExpr(ru))
		}
		return expr, nil
	}

	// Negated set: build the sum of every supported rune that is not listed.
	excluded := collection.NewSet[rune]()
	for _, ru := range runes {
		excluded.Insert(ru)
	}
	for _, ru := range automata.SupportedChars {
		if excluded.Contains(ru) {
			continue
		}
		if expr == nil {
			expr = NewSymbolExpr(ru)
		} else {
			expr = NewSumExpr(expr, NewSymbolExpr(ru))
		}
	}
	// Every supported rune was excluded: report it instead of handing a nil
	// expression with a nil error back to the caller.
	if expr == nil {
		return nil, ErrParse
	}

	return expr, nil
}

func (p *Parser) concat() (RegexExpr, error) {
lhs, err := p.star()
if err != nil {
Expand All @@ -98,7 +170,7 @@ func (p *Parser) concat() (RegexExpr, error) {
}

switch b.GetType() {
case SymbolTokenType, DotTokenType, LParenTokenType:
case SymbolTokenType, DotTokenType, LParenTokenType, LSqBracketTokenType:
rhs, err := p.concat()
if err != nil {
return nil, err
Expand Down Expand Up @@ -133,28 +205,38 @@ func (p *Parser) primary() (RegexExpr, error) {
if err != nil {
return nil, err
}
if s.GetType() == SymbolTokenType {

switch s.GetType() {
case SymbolTokenType:
return NewSymbolExpr(s.GetRune()), nil
}
if s.GetType() == DotTokenType {
case DotTokenType:
return NewDotExpr(), nil
}
if s.GetType() != LParenTokenType {
return nil, ErrParse
case LParenTokenType:
sum, err := p.sum()
if err != nil {
return nil, err
}
r, err := p.read()
if err != nil {
return nil, err
}
if r.GetType() == RParenTokenType {
return sum, nil
}
case LSqBracketTokenType:
set, err := p.set()
if err != nil {
return nil, err
}
r, err := p.read()
if err != nil {
return nil, err
}
if r.GetType() == RSqBracketTokenType {
return set, nil
}
}

// grouping expr
sum, err := p.sum()
if err != nil {
return nil, err
}
r, err := p.read()
if err != nil {
return nil, err
}
if r.GetType() == RParenTokenType {
return sum, nil
}
return nil, ErrParse
}

Expand Down
129 changes: 112 additions & 17 deletions sample/README.md
Original file line number Diff line number Diff line change
@@ -1,37 +1,132 @@

Lexically analyze the following function definition.
```go
func foo000() int {
x := 1 * 10 + 123 - 1000 / 5432

return x
}
```


```bash
$ go install github.com/goropikari/golex@latest
$ golex sample.l
$ go run golex.yy.go

2 abb
3 ab
1 a
1 a
1 a
Keyword
"func"
Identifier
"foo000"
LParen
"("
RParen
")"
Identifier
"int"
LBracket
"{"
Identifier
"x"
Operator
":="
Digit
"1"
Operator
"*"
Digit
"10"
Operator
"+"
Digit
"123"
Operator
"-"
Digit
"1000"
Operator
"/"
Digit
"5432"
Keyword
"return"
Identifier
"x"
RBracket
"}"
2022/08/29 01:39:20 EOF
exit status 1
```


`golex.yy.go`
`sample.l`
```go
// "a" { return State1, nil }
// "abb" { return State2, nil }
// "a*bb*" { return State3, nil }
%{
import (
"fmt"
"log"
)

type Type = int
const (
Keyword Type = iota + 1
Identifier
Digit
Whitespace
LParen
RParen
LBracket
RBracket
Operator
)

%}

%%
"if|for|while|func" { return Keyword, nil }
"[a-zA-Z][a-zA-Z0-9]*" { return Identifier, nil }
"[1-9][0-9]*" { return Digit, nil }
"[ \t\n\r]*" { return Whitespace, nil }
"\\(" { return LParen, nil }
"\\)" { return RParen, nil }
"{" { return LBracket, nil }
"}" { return RBracket, nil }
"[\\+|\\-|\\*|/|:=|==|!=]" { return Operator, nil }
"." {}
%%

func main() {
lex := New("abbabaaa")
lex := New(`
func foo000() {
x := 1 * 10 + 123 - 1000 / 5432
}
`)
for {
n, err := lex.Next()
if err != nil {
log.Fatal(err)
return
}
switch n {
case State1:
fmt.Println(State1, YYtext)
case State2:
fmt.Println(State2, YYtext)
case State3:
fmt.Println(State3, YYtext)
case Keyword:
fmt.Println("Keyword")
case Identifier:
fmt.Println("Identifier")
case Digit:
fmt.Println("Digit")
case Whitespace:
fmt.Println("Whitespace")
case LParen:
fmt.Println("LParen")
case RParen:
fmt.Println("RParen")
case LBracket:
fmt.Println("LBracket")
case RBracket:
fmt.Println("RBracket")
case Operator:
fmt.Println("Operator")
}
fmt.Printf("\t %#v\n",YYtext)
}
}
```
Loading

0 comments on commit 6d2e42a

Please sign in to comment.