Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Weird Parser Error #117

Merged
merged 4 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
File renamed without changes.
File renamed without changes.
73 changes: 41 additions & 32 deletions fixer_v2/query/buffer.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package query

import (
"errors"
_ "errors"
"fmt"
"io"
"strings"
Expand All @@ -21,6 +21,7 @@ type buffer struct {
last States // Previous state
state States // Current state
class Classes // Character class of current byte
mode CharClassMode

tokenStart int // Starting position of current token
tokenValue strings.Builder // Accumulates characters for current token
Expand All @@ -35,9 +36,14 @@ func newBuffer(input string) *buffer {
index: 0,
last: GO,
state: GO,
mode: ModeText,
}
}

// setMode switches the character-classification mode stored on the buffer.
// getClass passes this mode to getCharacterClass for every byte it classifies.
func (b *buffer) setMode(mode CharClassMode) {
b.mode = mode
}

// startToken begins accumulating a new token by recording the start position
// and resetting the token value builder. This should be called at the start
// of parsing any new token.
Expand All @@ -52,7 +58,7 @@ func (b *buffer) getClass() Classes {
if b.index >= b.length {
return C_OTHER
}
return getCharacterClass(b.data[b.index])
return getCharacterClass(b.data[b.index], b.mode)
}

// transition performs a state transition based on the current character and state.
Expand Down Expand Up @@ -125,42 +131,45 @@ func (b *buffer) parseMetaVariable() (*HoleConfig, error) {
return nil, fmt.Errorf("incomplete meta variable at position %d", b.tokenStart)
}

// parseText parses regular text content until a special character or meta-variable
// pattern is encountered. Handles both regular text and whitespace.
//
// The parsing process:
// 1. Accumulates characters while in TX or WS states
// 2. Stops at special characters (CL, OB, DB states)
// 3. Returns accumulated text or error if no text found
// parseText collects and returns text from the current index
// until it encounters a 'boundary character' (e.g., :, [, ], {, }, *) or EOF.
// Implemented using a 'peek' approach to look at the next character.
func (b *buffer) parseText() (string, error) {
if len(b.data) == 0 {
return "", nil
}

b.startToken()
b.setMode(ModeText)

// process as text until boundary character appears
for b.index < b.length {
state, err := b.transition()
if err != nil && !errors.Is(err, io.EOF) {
return "", err
}

currentChar := b.data[b.index]

// stop at special characters or meta-variable start
if state == CL || state == OB || state == DB {
break
}

// handle whitespace or regular text
if state == TX || state == WS {
b.tokenValue.WriteByte(currentChar)
class := b.getClass()

switch class {
// current character is for metavar start/end or block delimiter => end text segment
case C_COLON, C_LBRACK, C_RBRACK, C_LBRACE, C_RBRACE, C_QUANT:
// breaking here leaves the character unconsumed,
// so it will be processed by next token (metavar etc.)
goto DONE

case C_SPACE:
// TODO (@notJoon): Decide whether to treat whitespace as part of text or separate WS token.
// If you want "all WS in Text token", handle same as default below
// or, if you want separate TokenWhitespace, break here
fallthrough

default:
// accumulate regular characters as text
b.tokenValue.WriteByte(b.data[b.index])
b.index++
continue
}

break
}

if b.tokenValue.Len() == 0 {
return "", fmt.Errorf("no text found at position %d", b.tokenStart)
}

return b.tokenValue.String(), nil
DONE:
// end of text segment
text := b.tokenValue.String()
// TODO (@notJoon): Return even if length 0
// skip empty tokens if needed
return text, nil
}
10 changes: 5 additions & 5 deletions fixer_v2/query/buffer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,16 +332,16 @@ func TestBuffer_ParseText(t *testing.T) {
wantErr: false,
},
{
name: "text between metavariables",
input: ":[var1]middle:[var2]",
want: "",
wantErr: true,
name: "if cond with metavariable",
input: "if :[cond] {}",
want: "if ",
wantErr: false,
},
{
name: "empty input",
input: "",
want: "",
wantErr: true,
wantErr: false,
},
}

Expand Down
184 changes: 56 additions & 128 deletions fixer_v2/query/internal.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
package query

import (
"fmt"
"strings"
)

/*
State Transition Machine Design Rationale

Expand Down Expand Up @@ -117,29 +112,20 @@ const (
// 4. TX (text) state allows transitioning back to pattern parsing
var StateTransitionTable = [14][9]States{
// COLON LBRACK RBRACK LBRACE RBRACE SPACE IDENT QUANT OTHER
/* GO 0*/ { CL, OB, ER, BR, BR, WS, TX, ER, ER },
/* OK 1*/ { CL, OB, ER, BR, BR, WS, TX, ER, ER },
/* CL 2*/ { TX, OB, ER, ER, ER, ER, ID, ER, ER },
/* OB 3*/ { TX, DB, ER, ER, ER, ER, NM, ER, ER },
/* DB 4*/ { TX, ER, ER, ER, ER, ER, NM, ER, ER },
/* NM 5*/ { ID, ER, CB, ER, ER, ER, NM, ER, ER },
/* ID 6*/ { ER, ER, CB, ER, ER, ER, ID, ER, ER },
/* CB 7*/ { OK, ER, QB, ER, ER, WS, TX, QT, ER },
/* QB 8*/ { OK, ER, ER, ER, ER, WS, TX, QT, ER },
/* QT 9*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER },
/* TX10*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER },
/* WS11*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER },
/* BR12*/ { CL, ER, ER, BR, OK, WS, TX, ER, ER },
/* ER13*/ { ER, ER, ER, ER, ER, ER, ER, ER, ER },
}

// isFinalState determines whether a given state is a final (accepting) state.
func isFinalState(s States) bool {
switch s {
case OK, QB, QT, TX:
return true
}
return false
/* GO 0*/ {CL, OB, ER, BR, BR, WS, TX, ER, ER},
/* OK 1*/ {CL, OB, ER, BR, BR, WS, TX, ER, ER},
/* CL 2*/ {TX, OB, ER, ER, ER, ER, ID, ER, ER},
/* OB 3*/ {TX, DB, ER, ER, ER, ER, NM, ER, ER},
/* DB 4*/ {TX, ER, ER, ER, ER, ER, NM, ER, ER},
/* NM 5*/ {ID, ER, CB, ER, ER, ER, NM, ER, ER},
/* ID 6*/ {ER, ER, CB, ER, ER, ER, ID, ER, ER},
/* CB 7*/ {OK, ER, QB, ER, ER, WS, TX, QT, ER},
/* QB 8*/ {OK, ER, ER, ER, ER, WS, TX, QT, ER},
/* QT 9*/ {CL, ER, ER, BR, BR, WS, TX, ER, ER},
/* TX10*/ {CL, OB, CB, BR, BR, WS, TX, QT, ER},
/* WS11*/ {CL, ER, ER, BR, BR, WS, TX, ER, ER},
/* BR12*/ {CL, ER, ER, BR, OK, WS, TX, ER, ER},
/* ER13*/ {ER, ER, ER, ER, ER, ER, ER, ER, ER},
}

func (c Classes) String() string {
Expand Down Expand Up @@ -167,102 +153,18 @@ func (c Classes) String() string {
}
}

// StateMachine represents the parser's state machine.
// It couples the current state with the input pattern and a cursor into it.
type StateMachine struct {
state States // Current state
input string // Input pattern to parse
position int // Current position in input (byte offset of the next byte to read)
}
// CharClassMode selects how getCharacterClass classifies a byte, based on the
// parsing context the byte appears in.
type CharClassMode int

// NewStateMachine returns a state machine for input, starting in the GO state
// with its cursor at the first byte.
func NewStateMachine(input string) *StateMachine {
	var machine StateMachine
	machine.state = GO
	machine.input = input
	machine.position = 0
	return &machine
}

// Transition records the transition details between states:
// one entry per input byte consumed by the state machine.
type Transition struct {
char byte // Input byte that triggered the transition
fromState States // State before the byte was consumed
class Classes // Character class the byte was classified as
toState States // State looked up in StateTransitionTable for (fromState, class)
pos int // Position in input
}

// recordTransitions consumes the remaining input one byte at a time, looking up
// each step in StateTransitionTable, and returns every step taken in order.
// The machine's state and position are updated as it goes; the result is nil
// when there is no input left to consume.
func (sm *StateMachine) recordTransitions() []Transition {
	var steps []Transition

	for ; sm.position < len(sm.input); sm.position++ {
		ch := sm.input[sm.position]
		cls := getCharacterClass(ch)
		from := sm.state
		to := StateTransitionTable[from][cls]

		steps = append(steps, Transition{
			char:      ch,
			fromState: from,
			class:     cls,
			toState:   to,
			pos:       sm.position,
		})

		// Reaching OK mid-input means one token (e.g., a metavariable) is
		// complete, so restart from GO to recognize the next token.
		if to == OK {
			sm.state = GO
		} else {
			sm.state = to
		}
	}

	return steps
}

// recordTransitionsStrict runs recordTransitions and then validates the state
// the machine stopped in. On acceptance the state is set to OK and the error is
// nil; otherwise the recorded transitions are returned together with an error
// describing why the parse was rejected.
func (sm *StateMachine) recordTransitionsStrict() ([]Transition, error) {
	steps := sm.recordTransitions()

	switch {
	case sm.state == CB:
		// A closing bracket is only accepted when it was reached from NM
		// (single-bracketed form).
		// NOTE(review): assumes at least one transition was recorded when the
		// machine ends in CB - confirm with callers.
		final := steps[len(steps)-1]
		if final.fromState != NM {
			return steps, fmt.Errorf("incomplete parse: ended in CB from state %v", final.fromState)
		}
		sm.state = OK
		return steps, nil

	case isFinalState(sm.state):
		// Any other accepting state is normalized to OK.
		sm.state = OK
		return steps, nil

	case sm.state == ER:
		return steps, fmt.Errorf("invalid parse: reached ERROR state")

	default:
		// Neither accepting nor ERROR: the input stopped mid-token.
		return steps, fmt.Errorf("incomplete parse: ended in state %v", sm.state)
	}
}

// visualizeTransitions renders the transitions as text, one per line, in the
// form "<char>: <from> -<class>-> <to>".
func visualizeTransitions(transitions []Transition) string {
	var out strings.Builder
	for i := range transitions {
		step := &transitions[i]
		line := fmt.Sprintf("%c: %v -%v-> %v\n",
			step.char, step.fromState, step.class, step.toState)
		out.WriteString(line)
	}
	return out.String()
}
// Character classification modes accepted by getCharacterClass.
const (
ModeText CharClassMode = iota // normal text mode: characters outside the special/whitespace classes are classified as C_IDENT
ModeHole // metavariable hole: only identifier characters (see isIdentChar) are C_IDENT, the rest are C_OTHER
)

// getCharacterClass determines the character class for a given byte
// Handles special characters, whitespace, and identifier characters
// Returns C_OTHER for any character that doesn't fit other categories
func getCharacterClass(c byte) Classes {
func getCharacterClass(c byte, mode CharClassMode) Classes {
// Check special characters first
switch c {
case ':':
Expand All @@ -284,20 +186,46 @@ func getCharacterClass(c byte) Classes {
return C_SPACE
}

// Check for identifier characters
if isIdentChar(c) {
return C_IDENT
switch mode {
case ModeHole:
// in metavariable hole, we allow only identifier characters
if isIdentChar(c) {
return C_IDENT
}
return C_OTHER
default:
return C_IDENT // text mode allows all characters
}
}

return C_OTHER
// identCharTable is a lookup table indexed by byte value. An entry is true
// when that byte may appear in an identifier: ASCII letters, ASCII digits,
// underscore, and hyphen.
var identCharTable = func() [256]bool {
	var table [256]bool

	// contiguous ranges: lowercase (a-z), uppercase (A-Z), numbers (0-9)
	for _, span := range [...][2]byte{{'a', 'z'}, {'A', 'Z'}, {'0', '9'}} {
		for c := int(span[0]); c <= int(span[1]); c++ {
			table[c] = true
		}
	}

	// special characters
	table['_'] = true
	table['-'] = true

	return table
}()

// isIdentChar reports whether c is valid in an identifier.
// Allows: alphanumeric, underscore, and hyphen (Comby syntax allows hyphens
// in identifiers).
func isIdentChar(c byte) bool {
	// identCharTable has 256 entries, so indexing it with a byte cannot go
	// out of range.
	return identCharTable[c]
}
Loading
Loading