Skip to content

Commit

Permalink
wordbreaks: fix open quotes
Browse files Browse the repository at this point in the history
  • Loading branch information
rsteube committed Dec 3, 2023
1 parent 0bd041d commit a876c4b
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 29 deletions.
21 changes: 14 additions & 7 deletions shlex.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ func (l LexerState) MarshalJSON() ([]byte, error) {

// Token is a (type, value) pair representing a lexical token.
type Token struct {
Type TokenType
Value string
RawValue string
Index int
State LexerState
WordbreakType WordbreakType `json:",omitempty"`
Type TokenType
Value string
RawValue string
Index int
State LexerState
WordbreakType WordbreakType `json:",omitempty"`
WordbreakIndex int
}

func (t *Token) add(r rune) {
Expand Down Expand Up @@ -61,7 +62,8 @@ func (t *Token) Equal(other *Token) bool {
t.RawValue != other.RawValue,
t.Index != other.Index,
t.State != other.State,
t.WordbreakType != other.WordbreakType:
t.WordbreakType != other.WordbreakType,
t.WordbreakIndex != other.WordbreakIndex:
return false
default:
return true
Expand Down Expand Up @@ -278,9 +280,11 @@ func (t *tokenizer) scanStream() (*Token, error) {
case escapingQuoteRuneClass:
token.Type = WORD_TOKEN
t.state = QUOTING_ESCAPING_STATE
token.WordbreakIndex = t.index
case nonEscapingQuoteRuneClass:
token.Type = WORD_TOKEN
t.state = QUOTING_STATE
token.WordbreakIndex = t.index
case escapeRuneClass:
token.Type = WORD_TOKEN
t.state = ESCAPING_STATE
Expand Down Expand Up @@ -318,8 +322,10 @@ func (t *tokenizer) scanStream() (*Token, error) {
return token, err
case escapingQuoteRuneClass:
t.state = QUOTING_ESCAPING_STATE
token.WordbreakIndex = t.index
case nonEscapingQuoteRuneClass:
t.state = QUOTING_STATE
token.WordbreakIndex = t.index
case escapeRuneClass:
t.state = ESCAPING_STATE
default:
Expand All @@ -341,6 +347,7 @@ func (t *tokenizer) scanStream() (*Token, error) {
return token, err
default:
t.state = QUOTING_ESCAPING_STATE
token.WordbreakIndex = t.index
token.add(nextRune)
}
case QUOTING_ESCAPING_STATE: // in escaping double quotes
Expand Down
42 changes: 21 additions & 21 deletions shlex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,27 @@ func init() {
func TestTokenizer(t *testing.T) {
testInput := strings.NewReader(testString)
expectedTokens := []*Token{
{WORD_TOKEN, "one", "one", 0, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "two", "two", 4, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "three four", "\"three four\"", 8, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "five \"six\"", "\"five \\\"six\\\"\"", 21, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "seven#eight", "seven#eight", 36, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{COMMENT_TOKEN, " nine # ten", "# nine # ten", 48, START_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "eleven", "eleven", 62, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "twelve\\", "'twelve\\'", 69, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "thirteen", "thirteen", 79, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORDBREAK_TOKEN, "=", "=", 87, WORDBREAK_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "13", "13", 88, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "fourteen/14", "fourteen/14", 91, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORDBREAK_TOKEN, "|", "|", 103, WORDBREAK_STATE, WORDBREAK_PIPE},
{WORDBREAK_TOKEN, "||", "||", 105, WORDBREAK_STATE, WORDBREAK_LIST_OR},
{WORDBREAK_TOKEN, "|", "|", 108, WORDBREAK_STATE, WORDBREAK_PIPE},
{WORD_TOKEN, "after", "after", 109, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "before", "before", 115, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORDBREAK_TOKEN, "|", "|", 121, WORDBREAK_STATE, WORDBREAK_PIPE},
{WORDBREAK_TOKEN, "&", "&", 123, WORDBREAK_STATE, WORDBREAK_LIST_ASYNC},
{WORDBREAK_TOKEN, ";", ";", 125, WORDBREAK_STATE, WORDBREAK_LIST_SEQUENTIAL},
{WORD_TOKEN, "", "", 126, START_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "one", "one", 0, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "two", "two", 4, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "three four", "\"three four\"", 8, IN_WORD_STATE, WORDBREAK_UNKNOWN, 9},
{WORD_TOKEN, "five \"six\"", "\"five \\\"six\\\"\"", 21, IN_WORD_STATE, WORDBREAK_UNKNOWN, 34},
{WORD_TOKEN, "seven#eight", "seven#eight", 36, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{COMMENT_TOKEN, " nine # ten", "# nine # ten", 48, START_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "eleven", "eleven", 62, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "twelve\\", "'twelve\\'", 69, IN_WORD_STATE, WORDBREAK_UNKNOWN, 70},
{WORD_TOKEN, "thirteen", "thirteen", 79, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORDBREAK_TOKEN, "=", "=", 87, WORDBREAK_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "13", "13", 88, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "fourteen/14", "fourteen/14", 91, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORDBREAK_TOKEN, "|", "|", 103, WORDBREAK_STATE, WORDBREAK_PIPE, 0},
{WORDBREAK_TOKEN, "||", "||", 105, WORDBREAK_STATE, WORDBREAK_LIST_OR, 0},
{WORDBREAK_TOKEN, "|", "|", 108, WORDBREAK_STATE, WORDBREAK_PIPE, 0},
{WORD_TOKEN, "after", "after", 109, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "before", "before", 115, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORDBREAK_TOKEN, "|", "|", 121, WORDBREAK_STATE, WORDBREAK_PIPE, 0},
{WORDBREAK_TOKEN, "&", "&", 123, WORDBREAK_STATE, WORDBREAK_LIST_ASYNC, 0},
{WORDBREAK_TOKEN, ";", ";", 125, WORDBREAK_STATE, WORDBREAK_LIST_SEQUENTIAL, 0},
{WORD_TOKEN, "", "", 126, START_STATE, WORDBREAK_UNKNOWN, 0},
}

tokenizer := newTokenizer(testInput)
Expand Down
13 changes: 12 additions & 1 deletion tokenslice.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package shlex

import "strconv"
import (
"strconv"
)

type TokenSlice []Token

Expand Down Expand Up @@ -93,6 +95,15 @@ func (t TokenSlice) CurrentToken() (token Token) {
func (t TokenSlice) WordbreakPrefix() string {
found := false
prefix := ""

last := t[len(t)-1]
switch last.State {
case QUOTING_STATE, QUOTING_ESCAPING_STATE, ESCAPING_QUOTED_STATE:
found = true
// TODO: add the value up to the last opening quote to the prefix
prefix = last.RawValue[:last.WordbreakIndex-last.Index-1] // TODO test - this is wrong (needs to be value up to rawvalue index -> just rescan the substring)
}

for i := len(t) - 2; i >= 0; i-- {
token := t[i]
if !token.adjoins(t[i+1]) {
Expand Down

0 comments on commit a876c4b

Please sign in to comment.