From c85b34eae9c20d3d4e3e9cd0a9fdce2f65e16237 Mon Sep 17 00:00:00 2001
From: Gani Georgiev <gani.georgiev@gmail.com>
Date: Sat, 22 Jan 2022 19:28:29 +0200
Subject: [PATCH] initial commit

---
 LICENSE.md       |  29 +++
 README.md        |  95 ++++++++++
 examples_test.go |  40 ++++
 go.mod           |   3 +
 parser.go        | 116 ++++++++++++
 parser_test.go   |  97 ++++++++++
 scanner.go       | 479 +++++++++++++++++++++++++++++++++++++++++++++++
 scanner_test.go  | 116 ++++++++++++
 8 files changed, 975 insertions(+)
 create mode 100644 LICENSE.md
 create mode 100644 README.md
 create mode 100644 examples_test.go
 create mode 100644 go.mod
 create mode 100644 parser.go
 create mode 100644 parser_test.go
 create mode 100644 scanner.go
 create mode 100644 scanner_test.go

diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..9ca6f39
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022, Gani Georgiev
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8295f56
--- /dev/null
+++ b/README.md
@@ -0,0 +1,95 @@
+fexpr
+[![Go Report Card](https://goreportcard.com/badge/github.com/ganigeorgiev/fexpr)](https://goreportcard.com/report/github.com/ganigeorgiev/fexpr)
+[![GoDoc](https://godoc.org/github.com/ganigeorgiev/fexpr?status.svg)](https://pkg.go.dev/github.com/ganigeorgiev/fexpr)
+================================================================================
+
+**fexpr** is a filter query language parser that generates an extremely easy to work with AST structure so that you can safely create SQL, Elasticsearch, etc. queries from user input.
+
+Or in other words, transform the string `"id > 1"` into the struct `[{&& {{identifier id} > {number 1}}}]`.
+
+Supports parenthesis and various conditional expression operators (see [Grammar](https://github.com/ganigeorgiev/fexpr#grammar)).
+
+
+## Example usage
+
+```
+go get github.com/ganigeorgiev/fexpr
+```
+
+```go
+package main
+
+import "github.com/ganigeorgiev/fexpr"
+
+func main() {
+    result, err := fexpr.Parse("id=123 && status='active'")
+    // result: [{&& {{identifier id} = {number 123}}} {&& {{identifier status} = {text active}}}]
+}
+```
+
+> Note that each parsed expression statement contains a join/union operator (`&&` or `||`) so that the result can be consumed in small chunks without having to rely on the group/nesting context.
+
+> See the [package documentation](https://pkg.go.dev/github.com/ganigeorgiev/fexpr) for more details and examples.
+
+## Grammar
+
+**fexpr** grammar resembles the SQL `WHERE` expression syntax. It recognizes several token types (identifiers, numbers, quoted text, expression operators, whitespaces, etc.).
+
+> You could find all supported tokens in [`scanner.go`](https://github.com/ganigeorgiev/fexpr/blob/master/scanner.go).
+
+
+#### Operators
+
+- **`=`**  Equal operator (eg. `a=b`)
+- **`!=`** NOT Equal operator (eg. `a!=b`)
+- **`>`**  Greater than operator (eg. `a>b`)
+- **`>=`** Greater than or equal operator (eg. `a>=b`)
+- **`<`**  Less than operator (eg. `a<b`)
+- **`<=`** Less than or equal operator (eg. `a<=b`)
+- **`~`**  Like/Contains operator (eg. `a~b`)
+- **`!~`** NOT Like/Contains operator (eg. `a!~b`)
+- **`&&`** AND join operator (eg. `a=b && c=d`)
+- **`||`** OR join operator (eg. `a=b || c=d`)
+- **`()`** Parenthesis (eg. `(a=1 && b=2) || (a=3 && b=4)`)
+
+
+#### Numbers
+Number tokens are any integer or decimal numbers. **Example**: `123`, `10.50`.
+
+
+#### Identifiers
+
+Identifier tokens are literals that start with a letter, `_`, `@` or `#` and can be followed by any number of those characters, digits or `.` (usually used as a separator).
+**Example**: `id`, `a.b.c`, `@request.method`, `field2`.
+
+
+#### Quoted text
+
+Text tokens are any literals that are wrapped by `'` or `"` quotes.
+**Example**: `'Lorem ipsum dolor 123!'`, `"escaped \"word\""`, `"mixed 'quotes' are fine"`.
+
+
+## Using only the scanner
+
+The tokenizer (aka. `fexpr.Scanner`) could be used without the parser's state machine so that you can write your own custom tokens processing:
+
+```go
+s := fexpr.NewScanner(strings.NewReader("id > 123"))
+
+// scan single token at a time until EOF or error is reached
+for {
+    t, err := s.Scan()
+    if t.Type == fexpr.TokenEOF || err != nil {
+        break
+    }
+
+    fmt.Println(t)
+}
+
+// Output:
+// {identifier id}
+// {whitespace  }
+// {sign >}
+// {whitespace  }
+// {number 123}
+```
diff --git a/examples_test.go b/examples_test.go
new file mode 100644
index 0000000..df2202d
--- /dev/null
+++ b/examples_test.go
@@ -0,0 +1,40 @@
+package fexpr_test
+
+import (
+	"fexpr"
+	"fmt"
+	"strings"
+)
+
+func ExampleNewScanner() {
+	fexpr.NewScanner(strings.NewReader("id"))
+}
+
+func ExampleScanner_Scan() {
+	s := fexpr.NewScanner(strings.NewReader("id > 123"))
+
+	for {
+		t, err := s.Scan()
+		if t.Type == fexpr.TokenEOF || err != nil {
+			break
+		}
+
+		fmt.Println(t)
+	}
+
+	// Output:
+	// {identifier id}
+	// {whitespace  }
+	// {sign >}
+	// {whitespace  }
+	// {number 123}
+}
+
+func ExampleParse() {
+	result, _ := fexpr.Parse("id > 123")
+
+	fmt.Println(result)
+
+	// Output:
+	// [{&& {{identifier id} > {number 123}}}]
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..641113c
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module fexpr
+
+go 1.18
diff --git a/parser.go b/parser.go
new file mode 100644
index 0000000..d79a5e8
--- /dev/null
+++ b/parser.go
@@ -0,0 +1,116 @@
+package fexpr
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+)
+
+// Expr represents an individual tokenized expression consisting
+// of left operand, operator and a right operand.
+type Expr struct {
+	Left  Token
+	Op    SignOp
+	Right Token
+}
+
+// ExprGroup represents a wrapped expression and its join type.
+//
+// The group's Item could be either an `Expr` instance or `[]ExprGroup` slice (for nested expressions).
+type ExprGroup struct {
+	Join JoinOp
+	Item interface{}
+}
+
+// parser's state machine steps
+const (
+	stepBeforeSign = iota
+	stepSign
+	stepAfterSign
+	StepJoin
+)
+
+// Parse parses the provided text and returns its processed AST
+// in the form of `ExprGroup` slice(s).
+func Parse(text string) ([]ExprGroup, error) {
+	result := []ExprGroup{}
+	scanner := NewScanner(strings.NewReader(text))
+	step := stepBeforeSign
+	join := JoinAnd
+
+	var expr Expr
+
+	for {
+		t, err := scanner.Scan()
+		if err != nil {
+			return nil, err
+		}
+
+		if t.Type == TokenEOF {
+			break
+		}
+
+		if t.Type == TokenWS {
+			continue
+		}
+
+		if t.Type == TokenGroup {
+			groupResult, err := Parse(t.Literal)
+			if err != nil {
+				return nil, err
+			}
+
+			// append only if non-empty group
+			if len(groupResult) > 0 {
+				result = append(result, ExprGroup{Join: join, Item: groupResult})
+			}
+
+			step = StepJoin
+			continue
+		}
+
+		switch step {
+		case stepBeforeSign:
+			if t.Type != TokenIdentifier && t.Type != TokenText && t.Type != TokenNumber {
+				return nil, fmt.Errorf("Expected left operand (identifier, text or number), got %q (%s)", t.Literal, t.Type)
+			}
+
+			expr = Expr{Left: t}
+
+			step = stepSign
+		case stepSign:
+			if t.Type != TokenSign {
+				return nil, fmt.Errorf("Expected a sign operator, got %q (%s)", t.Literal, t.Type)
+			}
+
+			expr.Op = SignOp(t.Literal)
+			step = stepAfterSign
+		case stepAfterSign:
+			if t.Type != TokenIdentifier && t.Type != TokenText && t.Type != TokenNumber {
+				return nil, fmt.Errorf("Expected right operand (identifier, text or number), got %q (%s)", t.Literal, t.Type)
+			}
+
+			expr.Right = t
+			result = append(result, ExprGroup{Join: join, Item: expr})
+
+			step = StepJoin
+		case StepJoin:
+			if t.Type != TokenJoin {
+				return nil, fmt.Errorf("Expected && or ||, got %q (%s)", t.Literal, t.Type)
+			}
+
+			join = JoinAnd
+			if t.Literal == "||" {
+				join = JoinOr
+			}
+
+			step = stepBeforeSign
+		}
+	}
+
+	if step != StepJoin {
+		return nil, errors.New("Invalid formatted filter expression.")
+	}
+
+	return result, nil
+}
diff --git a/parser_test.go b/parser_test.go
new file mode 100644
index 0000000..5553742
--- /dev/null
+++ b/parser_test.go
@@ -0,0 +1,97 @@
+package fexpr
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestParse(t *testing.T) {
+	testScenarios := []struct {
+		input         string
+		expectedError bool
+		expectedPrint string
+	}{
+		{`> 1`, true, "[]"},
+		{`a >`, true, "[]"},
+		{`a > >`, true, "[]"},
+		{`a > %`, true, "[]"},
+		{`a ! 1`, true, "[]"},
+		{`a - 1`, true, "[]"},
+		{`a + 1`, true, "[]"},
+		{`> a 1`, true, "[]"},
+		{`a || 1`, true, "[]"},
+		{`a && 1`, true, "[]"},
+		{`test > 1 &&`, true, `[]`},
+		{`|| test = 1`, true, `[]`},
+		{`test = 1 && ||`, true, "[]"},
+		{`test = 1 && a`, true, "[]"},
+		{`test = 1 && a`, true, "[]"},
+		{`test = 1 && "a"`, true, "[]"},
+		{`test = 1 a`, true, "[]"},
+		{`test = 1 a`, true, "[]"},
+		{`test = 1 "a"`, true, "[]"},
+		{`test = 1@test`, true, "[]"},
+		{`test = .@test`, true, "[]"},
+		// mismatched text quotes
+		{`test = "demo'`, true, "[]"},
+		{`test = 'demo"`, true, "[]"},
+		{`test = 'demo'"`, true, "[]"},
+		{`test = 'demo''`, true, "[]"},
+		{`test = "demo"'`, true, "[]"},
+		{`test = "demo""`, true, "[]"},
+		{`test = ""demo""`, true, "[]"},
+		{`test = ''demo''`, true, "[]"},
+		{"test = `demo`", true, "[]"},
+		// valid simple expression and sign operators check
+		{`1=12`, false, `[{&& {{number 1} = {number 12}}}]`},
+		{`   1    =    12    `, false, `[{&& {{number 1} = {number 12}}}]`},
+		{`"demo" != test`, false, `[{&& {{text demo} != {identifier test}}}]`},
+		{`a~1`, false, `[{&& {{identifier a} ~ {number 1}}}]`},
+		{`a !~ 1`, false, `[{&& {{identifier a} !~ {number 1}}}]`},
+		{`test>12`, false, `[{&& {{identifier test} > {number 12}}}]`},
+		{`test > 12`, false, `[{&& {{identifier test} > {number 12}}}]`},
+		{`test >="test"`, false, `[{&& {{identifier test} >= {text test}}}]`},
+		{`test<@demo.test2`, false, `[{&& {{identifier test} < {identifier @demo.test2}}}]`},
+		{`1<="test"`, false, `[{&& {{number 1} <= {text test}}}]`},
+		{`1<="te'st"`, false, `[{&& {{number 1} <= {text te'st}}}]`},
+		{`demo='te\'st'`, false, `[{&& {{identifier demo} = {text te'st}}}]`},
+		{`demo="te\'st"`, false, `[{&& {{identifier demo} = {text te\'st}}}]`},
+		{`demo="te\"st"`, false, `[{&& {{identifier demo} = {text te"st}}}]`},
+		// invalid parenthesis
+		{`(a=1`, true, `[]`},
+		{`a=1)`, true, `[]`},
+		{`((a=1)`, true, `[]`},
+		{`{a=1}`, true, `[]`},
+		{`[a=1]`, true, `[]`},
+		{`((a=1 || a=2) && c=1))`, true, `[]`},
+		// valid parenthesis
+		{`()`, true, `[]`},
+		{`(a=1)`, false, `[{&& [{&& {{identifier a} = {number 1}}}]}]`},
+		{`(a="test(")`, false, `[{&& [{&& {{identifier a} = {text test(}}}]}]`},
+		{`(a="test)")`, false, `[{&& [{&& {{identifier a} = {text test)}}}]}]`},
+		{`((a=1))`, false, `[{&& [{&& [{&& {{identifier a} = {number 1}}}]}]}]`},
+		{`a=1 || 2!=3`, false, `[{&& {{identifier a} = {number 1}}} {|| {{number 2} != {number 3}}}]`},
+		{`a=1 && 2!=3`, false, `[{&& {{identifier a} = {number 1}}} {&& {{number 2} != {number 3}}}]`},
+		{`a=1 && 2!=3 || "b"=a`, false, `[{&& {{identifier a} = {number 1}}} {&& {{number 2} != {number 3}}} {|| {{text b} = {identifier a}}}]`},
+		{`(a=1 && 2!=3) || "b"=a`, false, `[{&& [{&& {{identifier a} = {number 1}}} {&& {{number 2} != {number 3}}}]} {|| {{text b} = {identifier a}}}]`},
+		{`((a=1 || a=2) && (c=1))`, false, `[{&& [{&& [{&& {{identifier a} = {number 1}}} {|| {{identifier a} = {number 2}}}]} {&& [{&& {{identifier c} = {number 1}}}]}]}]`},
+	}
+
+	for i, scenario := range testScenarios {
+		v, err := Parse(scenario.input)
+
+		if scenario.expectedError && err == nil {
+			t.Errorf("(%d) Expected error, got nil (%q)", i, scenario.input)
+		}
+
+		if !scenario.expectedError && err != nil {
+			t.Errorf("(%d) Did not expect error, got %q (%q).", i, err, scenario.input)
+		}
+
+		vPrint := fmt.Sprintf("%v", v)
+
+		if vPrint != scenario.expectedPrint {
+			t.Errorf("(%d) Expected %s, got %s", i, scenario.expectedPrint, vPrint)
+		}
+	}
+}
diff --git a/scanner.go b/scanner.go
new file mode 100644
index 0000000..ea6b55b
--- /dev/null
+++ b/scanner.go
@@ -0,0 +1,479 @@
+package fexpr
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// eof represents a marker rune for the end of the reader.
+const eof = rune(0)
+
+// JoinOp represents a join type operator.
+type JoinOp string
+
+// supported join type operators
+const (
+	JoinAnd JoinOp = "&&"
+	JoinOr  JoinOp = "||"
+)
+
+// SignOp represents an expression sign operator.
+type SignOp string
+
+// supported expression sign operators
+const (
+	SignEq    SignOp = "="
+	SignNeq   SignOp = "!="
+	SignLike  SignOp = "~"
+	SignNlike SignOp = "!~"
+	SignLt    SignOp = "<"
+	SignLte   SignOp = "<="
+	SignGt    SignOp = ">"
+	SignGte   SignOp = ">="
+)
+
+// TokenType represents a Token type.
+type TokenType string
+
+// token type constants
+const (
+	TokenUnexpected TokenType = "unexpected"
+	TokenEOF        TokenType = "eof"
+	TokenWS         TokenType = "whitespace"
+	TokenJoin       TokenType = "join"
+	TokenSign       TokenType = "sign"
+	TokenIdentifier TokenType = "identifier" // variable, column name, placeholder, etc.
+	TokenNumber     TokenType = "number"
+	TokenText       TokenType = "text"  // ' or " quoted string
+	TokenGroup      TokenType = "group" // grouped/nested tokens
+)
+
+// Token represents a single scanned literal (one or more combined runes).
+type Token struct {
+	Type    TokenType
+	Literal string
+}
+
+// Scanner represents a filter and lexical scanner.
+type Scanner struct {
+	r *bufio.Reader
+}
+
+// NewScanner creates and returns a new scanner instance with the specified io.Reader.
+func NewScanner(r io.Reader) *Scanner {
+	return &Scanner{bufio.NewReader(r)}
+}
+
+// Scan reads and returns the next available token value from the scanner's buffer.
+func (s *Scanner) Scan() (Token, error) {
+	ch := s.read()
+
+	if isWhitespaceRune(ch) {
+		s.unread()
+		return s.scanWhitespace()
+	}
+
+	if isGroupStartRune(ch) {
+		s.unread()
+		return s.scanGroup()
+	}
+
+	if isIdentifierStartRune(ch) {
+		s.unread()
+		return s.scanIdentifier()
+	}
+
+	if isNumberStartRune(ch) {
+		s.unread()
+		return s.scanNumber()
+	}
+
+	if isTextStartRune(ch) {
+		s.unread()
+		return s.scanText()
+	}
+
+	if isSignStartRune(ch) {
+		s.unread()
+		return s.scanSign()
+	}
+
+	if isJoinStartRune(ch) {
+		s.unread()
+		return s.scanJoin()
+	}
+
+	if ch == eof {
+		return Token{Type: TokenEOF, Literal: ""}, nil
+	}
+
+	return Token{Type: TokenUnexpected, Literal: string(ch)}, fmt.Errorf("Unexpected character %q", ch)
+}
+
+// scanWhitespace consumes all contiguous whitespace runes.
+func (s *Scanner) scanWhitespace() (Token, error) {
+	var buf bytes.Buffer
+
+	// Reads every subsequent whitespace character into the buffer.
+	// Non-whitespace runes and EOF will cause the loop to exit.
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if !isWhitespaceRune(ch) {
+			s.unread()
+			break
+		}
+
+		// write the whitespace rune
+		buf.WriteRune(ch)
+	}
+
+	return Token{Type: TokenWS, Literal: buf.String()}, nil
+}
+
+// scanIdentifier consumes all contiguous ident runes.
+func (s *Scanner) scanIdentifier() (Token, error) {
+	var buf bytes.Buffer
+
+	// Read every subsequent identifier rune into the buffer.
+	// Non-ident runes and EOF will cause the loop to exit.
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if !isIdentifierStartRune(ch) && !isDigitRune(ch) && ch != '.' {
+			s.unread()
+			break
+		}
+
+		// write the ident rune
+		buf.WriteRune(ch)
+	}
+
+	literal := buf.String()
+
+	var err error
+	if !isIdentifier(literal) {
+		err = fmt.Errorf("Invalid identifier %q", literal)
+	}
+
+	return Token{Type: TokenIdentifier, Literal: literal}, err
+}
+
+// scanNumber consumes all contiguous digit and `.` runes.
+func (s *Scanner) scanNumber() (Token, error) {
+	var buf bytes.Buffer
+
+	// Read every subsequent digit rune into the buffer.
+	// Non-digit runes and EOF will cause the loop to exit.
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if !isDigitRune(ch) && ch != '.' {
+			s.unread()
+			break
+		}
+
+		// write the digit rune
+		buf.WriteRune(ch)
+	}
+
+	literal := buf.String()
+
+	var err error
+	if !isNumber(literal) {
+		err = fmt.Errorf("Invalid number %q", literal)
+	}
+
+	return Token{Type: TokenNumber, Literal: literal}, err
+}
+
+// scanText consumes all contiguous quoted text runes.
+func (s *Scanner) scanText() (Token, error) {
+	var buf bytes.Buffer
+
+	// read the first rune to determine the quotes type
+	firstCh := s.read()
+	buf.WriteRune(firstCh)
+	var prevCh rune
+	var hasMatchingQuotes bool
+
+	// Read every subsequent text rune into the buffer.
+	// EOF and matching unescaped ending quote will cause the loop to exit.
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		// write the text rune
+		buf.WriteRune(ch)
+
+		// unescaped matching quote, aka. the end
+		if ch == firstCh && prevCh != '\\' {
+			hasMatchingQuotes = true
+			break
+		}
+
+		prevCh = ch
+	}
+
+	literal := buf.String()
+
+	var err error
+	if !hasMatchingQuotes {
+		err = fmt.Errorf("Invalid quoted text %q", literal)
+	} else {
+		// unquote
+		literal = literal[1 : len(literal)-1]
+		// remove escaped quotes prefix (aka. \)
+		firstChStr := string(firstCh)
+		literal = strings.Replace(literal, `\`+firstChStr, firstChStr, -1)
+	}
+
+	return Token{Type: TokenText, Literal: literal}, err
+}
+
+// scanSign consumes all contiguous sign operator runes.
+func (s *Scanner) scanSign() (Token, error) {
+	var buf bytes.Buffer
+
+	// Read every subsequent sign rune into the buffer.
+	// Non-sign runes and EOF will cause the loop to exit.
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if !isSignStartRune(ch) {
+			s.unread()
+			break
+		}
+
+		// write the sign rune
+		buf.WriteRune(ch)
+	}
+
+	literal := buf.String()
+
+	var err error
+	if !isSignOperator(literal) {
+		err = fmt.Errorf("Invalid sign operator %q", literal)
+	}
+
+	return Token{Type: TokenSign, Literal: literal}, err
+}
+
+// scanJoin consumes all contiguous join operator runes.
+func (s *Scanner) scanJoin() (Token, error) {
+	var buf bytes.Buffer
+
+	// Read every subsequent join operator rune into the buffer.
+	// Non-join runes and EOF will cause the loop to exit.
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if !isJoinStartRune(ch) {
+			s.unread()
+			break
+		}
+
+		// write the join operator rune
+		buf.WriteRune(ch)
+	}
+
+	literal := buf.String()
+
+	var err error
+	if !isJoinOperator(literal) {
+		err = fmt.Errorf("Invalid join operator %q", literal)
+	}
+
+	return Token{Type: TokenJoin, Literal: literal}, err
+}
+
+// scanGroup consumes all runes within a group/parenthesis.
+func (s *Scanner) scanGroup() (Token, error) {
+	var buf bytes.Buffer
+
+	// read the first group bracket without writing it to the buffer
+	firstChar := s.read()
+	openGroups := 1
+
+	// Read every subsequent rune into the buffer.
+	// EOF and the closing bracket of the outermost group will cause the loop to exit.
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if isGroupStartRune(ch) {
+			// nested group
+			openGroups++
+			buf.WriteRune(ch)
+		} else if isTextStartRune(ch) {
+			s.unread()
+			t, err := s.scanText()
+			if err != nil {
+				// write the errored literal as it is
+				buf.WriteString(t.Literal)
+				return Token{Type: TokenGroup, Literal: buf.String()}, err
+			}
+
+			// quote the literal to preserve the text start/end runes
+			buf.WriteString("\"" + t.Literal + "\"")
+		} else if ch == ')' {
+			openGroups--
+
+			if openGroups <= 0 {
+				// main group end
+				break
+			} else {
+				buf.WriteRune(ch)
+			}
+		} else {
+			buf.WriteRune(ch)
+		}
+	}
+
+	literal := buf.String()
+
+	var err error
+	if !isGroupStartRune(firstChar) || openGroups > 0 {
+		err = fmt.Errorf("Invalid formatted group - missing %d closing bracket(s).", openGroups)
+	}
+
+	return Token{Type: TokenGroup, Literal: literal}, err
+}
+
+// read reads the next rune from the buffered reader.
+// Returns the `rune(0)` if an error or `io.EOF` occurs.
+func (s *Scanner) read() rune {
+	ch, _, err := s.r.ReadRune()
+	if err != nil {
+		return eof
+	}
+	return ch
+}
+
+// unread places the previously read rune back on the reader.
+func (s *Scanner) unread() error {
+	return s.r.UnreadRune()
+}
+
+// Lexical helpers:
+// -------------------------------------------------------------------
+
+// isWhitespaceRune checks if a rune is a space, tab, or newline.
+func isWhitespaceRune(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' }
+
+// isLetterRune checks if a rune is a letter.
+func isLetterRune(ch rune) bool {
+	return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
+}
+
+// isDigitRune checks if a rune is a digit.
+func isDigitRune(ch rune) bool {
+	return (ch >= '0' && ch <= '9')
+}
+
+// isIdentifierStartRune checks if a rune is valid identifier's first character.
+func isIdentifierStartRune(ch rune) bool {
+	return isLetterRune(ch) || ch == '_' || ch == '@' || ch == '#'
+}
+
+// isTextStartRune checks if a rune is a valid quoted text first character
+// (aka. single or double quote).
+func isTextStartRune(ch rune) bool {
+	return ch == '\'' || ch == '"'
+}
+
+// isNumberStartRune checks if a rune is a valid number start character (aka. digit).
+func isNumberStartRune(ch rune) bool {
+	return isDigitRune(ch)
+}
+
+// isSignStartRune checks if a rune is a valid sign operator start character.
+func isSignStartRune(ch rune) bool {
+	return ch == '=' ||
+		ch == '!' ||
+		ch == '>' ||
+		ch == '<' ||
+		ch == '~'
+}
+
+// isJoinStartRune checks if a rune is a valid join type start character.
+func isJoinStartRune(ch rune) bool {
+	return ch == '&' || ch == '|'
+}
+
+// isGroupStartRune checks if a rune is a valid group/parenthesis start character.
+func isGroupStartRune(ch rune) bool {
+	return ch == '('
+}
+
+// isSignOperator checks if a literal is a valid sign operator.
+func isSignOperator(literal string) bool {
+	op := SignOp(literal)
+
+	return op == SignEq ||
+		op == SignNeq ||
+		op == SignLt ||
+		op == SignLte ||
+		op == SignGt ||
+		op == SignGte ||
+		op == SignLike ||
+		op == SignNlike
+}
+
+// isJoinOperator checks if a literal is a valid join type operator.
+func isJoinOperator(literal string) bool {
+	op := JoinOp(literal)
+
+	return op == JoinAnd || op == JoinOr
+}
+
+// isNumber checks if a literal is numeric.
+func isNumber(literal string) bool {
+	// strconv.ParseFloat() considers numerics with dot suffix
+	// a valid floating point number (eg. "123."), but we don't want this
+	if literal == "" || literal[len(literal)-1] == '.' {
+		return false
+	}
+
+	_, err := strconv.ParseFloat(literal, 64)
+
+	return err == nil
+}
+
+var identifierRegex = regexp.MustCompile(`^[\w\.\@\#]*\w+$`)
+
+// isIdentifier checks if a literal is properly formatted identifier.
+func isIdentifier(literal string) bool {
+	return identifierRegex.MatchString(literal)
+}
diff --git a/scanner_test.go b/scanner_test.go
new file mode 100644
index 0000000..f53ec27
--- /dev/null
+++ b/scanner_test.go
@@ -0,0 +1,116 @@
+package fexpr
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+)
+
+func TestNewScanner(t *testing.T) {
+	s := NewScanner(strings.NewReader("test"))
+	dataBytes, _ := s.r.Peek(4)
+	data := string(dataBytes)
+
+	if data != "test" {
+		t.Errorf("Expected the scanner reader data to be %q, got %q", "test", data)
+	}
+}
+
+func TestScannerScan(t *testing.T) {
+	type output struct {
+		error bool
+		print string
+	}
+	testScenarios := []struct {
+		text    string
+		expects []output
+	}{
+		// whitespace
+		{"   ", []output{{false, "{whitespace    }"}}},
+		{"test 123", []output{{false, "{identifier test}"}, {false, "{whitespace  }"}, {false, "{number 123}"}}},
+		// identifier
+		{`test`, []output{{false, `{identifier test}`}}},
+		{`@test.123`, []output{{false, `{identifier @test.123}`}}},
+		{`_test.123`, []output{{false, `{identifier _test.123}`}}},
+		{`#test.123`, []output{{false, `{identifier #test.123}`}}},
+		{`.test.123`, []output{{true, `{unexpected .}`}, {false, `{identifier test.123}`}}},
+		{`test#@`, []output{{true, `{identifier test#@}`}}},
+		{`test'`, []output{{false, `{identifier test}`}, {true, `{text '}`}}},
+		{`test"d`, []output{{false, `{identifier test}`}, {true, `{text "d}`}}},
+		// number
+		{`123`, []output{{false, `{number 123}`}}},
+		{`123.123`, []output{{false, `{number 123.123}`}}},
+		{`.123`, []output{{true, `{unexpected .}`}, {false, `{number 123}`}}},
+		{`123.abc`, []output{{true, `{number 123.}`}, {false, `{identifier abc}`}}},
+		// text
+		{`""`, []output{{false, `{text }`}}},
+		{`''`, []output{{false, `{text }`}}},
+		{`'test'`, []output{{false, `{text test}`}}},
+		{`'te\'st'`, []output{{false, `{text te'st}`}}},
+		{`"te\"st"`, []output{{false, `{text te"st}`}}},
+		{`"tes@#,;!@#%^'\"t"`, []output{{false, `{text tes@#,;!@#%^'"t}`}}},
+		{`'tes@#,;!@#%^\'"t'`, []output{{false, `{text tes@#,;!@#%^'"t}`}}},
+		{`"test`, []output{{true, `{text "test}`}}},
+		{`'test`, []output{{true, `{text 'test}`}}},
+		// join types
+		{`&&||`, []output{{true, `{join &&||}`}}},
+		{`&& ||`, []output{{false, `{join &&}`}, {false, `{whitespace  }`}, {false, `{join ||}`}}},
+		{`'||test&&'&&123`, []output{{false, `{text ||test&&}`}, {false, `{join &&}`}, {false, `{number 123}`}}},
+		// expression signs
+		{`=!=`, []output{{true, `{sign =!=}`}}},
+		{`= != ~ !~ > >= < <=`, []output{
+			{false, `{sign =}`},
+			{false, `{whitespace  }`},
+			{false, `{sign !=}`},
+			{false, `{whitespace  }`},
+			{false, `{sign ~}`},
+			{false, `{whitespace  }`},
+			{false, `{sign !~}`},
+			{false, `{whitespace  }`},
+			{false, `{sign >}`},
+			{false, `{whitespace  }`},
+			{false, `{sign >=}`},
+			{false, `{whitespace  }`},
+			{false, `{sign <}`},
+			{false, `{whitespace  }`},
+			{false, `{sign <=}`},
+		}},
+		// groups/parenthesis
+		{`a)`, []output{{false, `{identifier a}`}, {true, `{unexpected )}`}}},
+		{`(a b c`, []output{{true, `{group a b c}`}}},
+		{`(a b c)`, []output{{false, `{group a b c}`}}},
+		{`((a b c))`, []output{{false, `{group (a b c)}`}}},
+		{`((a )b c))`, []output{{false, `{group (a )b c}`}, {true, `{unexpected )}`}}},
+		{`("ab)("c)`, []output{{false, `{group "ab)("c}`}}},
+		{`("ab)(c)`, []output{{true, `{group "ab)(c)}`}}},
+	}
+
+	for i, scenario := range testScenarios {
+		s := NewScanner(strings.NewReader(scenario.text))
+
+		// scan the text tokens
+		for j, expect := range scenario.expects {
+			token, err := s.Scan()
+
+			if expect.error && err == nil {
+				t.Errorf("(%d.%d) Expected error, got nil (%q)", i, j, scenario.text)
+			}
+
+			if !expect.error && err != nil {
+				t.Errorf("(%d.%d) Did not expect error, got %s (%q)", i, j, err, scenario.text)
+			}
+
+			tokenPrint := fmt.Sprintf("%v", token)
+
+			if tokenPrint != expect.print {
+				t.Errorf("(%d.%d) Expected token %s, got %s", i, j, expect.print, tokenPrint)
+			}
+		}
+
+		// the last remaining token should be the eof
+		lastToken, err := s.Scan()
+		if err != nil || lastToken.Type != TokenEOF {
+			t.Errorf("(%d) Expected EOF token, got %v (%v)", i, lastToken, err)
+		}
+	}
+}