package sigma

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// lexer scans an input string and streams the resulting tokens to the
// parser over a channel.
type lexer struct {
	input    string    // the string being parsed
	start    int       // the position where the current token started
	position int       // the current position of the scan
	width    int       // byte width of the last rune read; runes may span multiple bytes
	items    chan Item // the channel used to communicate between the lexer and the parser
}
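
// The design follows the state-function pattern from Rob Pike's talk
// "Lexical Scanning in Go": the lexer runs in its own goroutine and hands
// tokens to the parser over an unbuffered channel.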

// lex creates a lexer and starts scanning the provided input.
func lex(input string) *lexer {
	l := &lexer{
		input: input,
		items: make(chan Item), // unbuffered
	}
	go l.scan()
	return l
}
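
// A minimal consumption sketch (illustrative only; the condition string and
// the error handling here are assumptions, not part of this package's API):
//
//	l := lex("selection1 and not selection2")
//	for item := range l.items {
//		if item.T == TokErr || item.T == TokUnsupp {
//			break // the error message is carried in item.Val
//		}
//		// ... hand item to the parser ...
//	}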

// ignore resets the start position to the current scan position, effectively
// ignoring any input scanned so far.
func (l *lexer) ignore() {
	l.start = l.position
}

// next advances the lexer to the next rune in the input and returns it.
func (l *lexer) next() (r rune) {
	if l.position >= len(l.input) {
		l.width = 0
		return eof
	}
	r, l.width = utf8.DecodeRuneInString(l.todo())
	l.position += l.width
	return r
}
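
// Because DecodeRuneInString reports the rune's byte width, multi-byte input
// is handled correctly: given the input "ä(", the first call to next returns
// 'ä' and advances position by two bytes, not one.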

// backup steps back one rune, which is helpful when you've crossed a
// boundary from one state to another. Stepping back by the last rune's
// width rather than a single byte keeps multi-byte runes intact, so backup
// must only be called once per call to next.
func (l *lexer) backup() {
	l.position -= l.width
}

// scan steps through the provided text and executes state functions as
// state changes are observed in the input.
func (l *lexer) scan() {
	// When we begin processing, assume we're looking at a condition.
	// One state function returns another until nil is returned to signal
	// the end of the process.
	for fn := lexCondition; fn != nil; {
		fn = fn(l)
	}
	close(l.items)
}
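
// For the input "(x and y)", the resulting state chain is roughly:
//
//	lexCondition -> lexLpar -> lexCondition -> lexAccumulateBeforeWhitespace
//	-> lexWhitespace -> lexCondition -> ... -> lexRparWithTokens -> lexRpar
//	-> lexCondition -> lexEOF -> nil
//
// emitting TokSepLpar, "x", TokKeywordAnd, "y", TokSepRpar, and TokLitEof
// along the way.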

// unsuppf emits an unsupported-feature token carrying a formatted message
// and terminates the scan.
func (l *lexer) unsuppf(format string, args ...interface{}) stateFn {
	msg := fmt.Sprintf(format, args...)
	l.items <- Item{T: TokUnsupp, Val: msg}
	return nil
}

// errorf emits an error token carrying a formatted message and terminates
// the scan.
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
	msg := fmt.Sprintf(format, args...)
	l.items <- Item{T: TokErr, Val: msg}
	return nil
}

// emit sends an item over the channel so the parser can collect and manage
// each segment.
func (l *lexer) emit(k Token) {
	i := Item{T: k, Val: l.input[l.start:l.position]}
	l.items <- i
	l.ignore() // reset our scanner now that we've dispatched a segment
}

// collected returns the input scanned since the last emit.
func (l *lexer) collected() string { return l.input[l.start:l.position] }

// todo returns the input that has not yet been scanned.
func (l *lexer) todo() string { return l.input[l.position:] }

// stateFn represents a single lexer state. Each state function returns the
// next state, or nil when scanning is complete.
type stateFn func(*lexer) stateFn

// lexCondition is the top-level state: it scans a condition expression and
// dispatches to more specific states as separators are encountered.
func lexCondition(l *lexer) stateFn {
	for {
		if strings.HasPrefix(l.todo(), TokStOne.Literal()) {
			return lexOneOf
		}
		if strings.HasPrefix(l.todo(), TokStAll.Literal()) {
			return lexAllOf
		}
		switch r := l.next(); {
		case r == eof:
			return lexEOF
		case r == TokSepRpar.Rune():
			return lexRparWithTokens
		case r == TokSepLpar.Rune():
			return lexLpar
		case r == TokSepPipe.Rune():
			return lexPipe
		case unicode.IsSpace(r):
			return lexAccumulateBeforeWhitespace
		}
	}
}

// lexStatement is the entry state for a statement; it currently just
// delegates to lexCondition.
func lexStatement(l *lexer) stateFn {
	return lexCondition
}

// lexOneOf emits a "one of" statement token.
func lexOneOf(l *lexer) stateFn {
	l.position += len(TokStOne.Literal())
	l.emit(TokStOne)
	return lexCondition
}

// lexAllOf emits an "all of" statement token.
func lexAllOf(l *lexer) stateFn {
	l.position += len(TokStAll.Literal())
	l.emit(TokStAll)
	return lexCondition
}
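
// In Sigma condition syntax these states cover constructs such as
// "1 of selection*" and "all of them" (assuming TokStOne.Literal() and
// TokStAll.Literal() are "1 of" and "all of" respectively).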

// lexAggs handles the aggregation expression that follows a pipe. It is not
// implemented yet, so it emits an unsupported token.
func lexAggs(l *lexer) stateFn {
	return l.unsuppf("aggregation not supported yet [%s]", l.input)
}

// lexEOF emits any text collected so far, followed by an EOF token.
func lexEOF(l *lexer) stateFn {
	if l.position > l.start {
		l.emit(checkKeyWord(l.collected()))
	}
	l.emit(TokLitEof)
	return nil
}

// lexPipe emits a pipe separator and hands off to the aggregation state.
func lexPipe(l *lexer) stateFn {
	l.emit(TokSepPipe)
	return lexAggs
}

// lexLpar emits a left-parenthesis separator.
func lexLpar(l *lexer) stateFn {
	l.emit(TokSepLpar)
	return lexCondition
}

// lexRparWithTokens emits any text accumulated before a closing paren, then
// hands off to lexRpar.
func lexRparWithTokens(l *lexer) stateFn {
	// emit any text we've accumulated.
	if l.position > l.start {
		l.backup()
		// There may be any number of whitespace characters between the last
		// token and the closing paren.
		// TODO - there may be a more concise way to do this; right now loops
		// like this are everywhere.
		if t := checkKeyWord(l.collected()); t != TokNil {
			l.emit(t)
		}
		for {
			switch r := l.next(); {
			case r == eof:
				return lexEOF
			case unicode.IsSpace(r):
				l.ignore()
			default:
				return lexRpar
			}
		}
	}
	return lexRpar
}

// lexRpar emits a right-parenthesis separator.
func lexRpar(l *lexer) stateFn {
	l.emit(TokSepRpar)
	return lexCondition
}

// lexAccumulateBeforeWhitespace emits any text collected before a run of
// whitespace, then hands off to lexWhitespace.
func lexAccumulateBeforeWhitespace(l *lexer) stateFn {
	l.backup()
	// emit any text we've accumulated.
	if l.position > l.start {
		l.emit(checkKeyWord(l.collected()))
	}
	return lexWhitespace
}

// lexWhitespace scans what is expected to be whitespace, discarding it until
// the next non-space rune.
func lexWhitespace(l *lexer) stateFn {
	for {
		switch r := l.next(); {
		case r == eof:
			return lexEOF
		case !unicode.IsSpace(r):
			l.backup()
			return lexCondition
		default:
			l.ignore()
		}
	}
}

// checkKeyWord classifies a scanned word as a keyword, aggregation function,
// identifier, or wildcard identifier. Matching is case-insensitive.
func checkKeyWord(in string) Token {
	if len(in) == 0 {
		return TokNil
	}
	switch strings.ToLower(in) {
	case TokKeywordAnd.Literal():
		return TokKeywordAnd
	case TokKeywordOr.Literal():
		return TokKeywordOr
	case TokKeywordNot.Literal():
		return TokKeywordNot
	case "sum", "min", "max", "count", "avg":
		return TokKeywordAgg
	case TokIdentifierAll.Literal():
		return TokIdentifierAll
	case TokStOne.Literal():
		return TokStOne
	default:
		if strings.Contains(in, "*") {
			return TokIdentifierWithWildcard
		}
		return TokIdentifier
	}
}
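
// A few illustrative classifications (assuming the token literals are the
// lowercase keywords "and", "or", and "not"):
//
//	checkKeyWord("AND")   // TokKeywordAnd: matching is case-insensitive
//	checkKeyWord("count") // TokKeywordAgg
//	checkKeyWord("sel*")  // TokIdentifierWithWildcard
//	checkKeyWord("foo")   // TokIdentifier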