forked from johnkerl/miller
-
Notifications
You must be signed in to change notification settings - Fork 0
/
regex.go
528 lines (474 loc) · 16 KB
/
regex.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
// ================================================================
// Support for regular expressions in Miller.
//
// * By and large we use the Go library.
//
// * There is (for historical reasons) a DSL syntax "[a-z]"i (note the trailing i)
// for case-insensitive regular expressions which we map into Go syntax for
// regex-compilation.
//
// * Also for historical reasons, we allow things like
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1";
// }
// where the '=~' sets the captures and the "\2:\1" uses them. (Note that
// https://github.com/johnkerl/miller/issues/388 has a better suggestion which would make the
// captures explicit as variables, rather than implicit within CST state: this is implemented by
// the `strmatch` and `strmatchx` DSL functions. Regardless, the `=~` syntax will still be supported
// for backward compatibility and so is here to stay.) Here we make use of Go regexp-library
// functions to write to, and then later interpolate from, a captures array which is stored within
// CST state. (See the `runtime.State` object.)
//
// * "\0" is for a full match; "\1" .. "\9" are for submatch cqptures. E.g.
// if $x is "foobarbaz" and the regex is "foo(.)(..)baz", then "\0" is
// "foobarbaz", "\1" is "b", "\2" is "ar", and "\3".."\9" are "".
//
// * Naming:
//
// o "regexp" and "Regexp" are used for the Go library and its data structure, respectively;
//
// o "regex" is used for regular-expression strings following Miller's idiosyncratic syntax and
// semantics as described above.
//
// ================================================================
package lib
import (
"bytes"
"fmt"
"os"
"regexp"
"strings"
"sync"
)
// captureDetector is used to see if a string literal interpolates previous
// captures (like "\2:\1") or not (like "2:1").
var captureDetector = regexp.MustCompile(`\\[0-9]`)
// captureSplitter is used to precompute an offsets matrix for strings like
// "\2:\1" so they don't need to be recomputed on every record.
var captureSplitter = regexp.MustCompile(`(\\[0-9])`)
// See regexpCompileCached
var regexpCache map[string]*regexp.Regexp
const cacheMaxSize = 1000
var cacheMutex sync.Mutex
// regexpCompileCached keeps a cache of compiled regexes, so that the caller has the flexibility to
// only pass in strings while getting the benefits of compilation avoidance.
//
// Regarding cache size: in nominal use, regexp strings are within Miller DSL code statements, and
// there will be a handful. These will all get re-used after their first application, and the cache
// will remain bounded by the size of the user's DSL code. However, it is possible to have regex
// strings contained within Miller record-field data.
//
// We could solve this by using an LRU cache. However, for simplicity, we limit the number of
// cached compiles, and for any extras that appear during record processing, we simply recompile
// each time.
func regexpCompileCached(s string) (*regexp.Regexp, error) {
if len(regexpCache) > cacheMaxSize {
return regexp.Compile(s)
}
r, err := regexp.Compile(s)
if err == nil {
cacheMutex.Lock()
if regexpCache == nil {
regexpCache = make(map[string]*regexp.Regexp)
}
regexpCache[s] = r
cacheMutex.Unlock()
}
return r, err
}
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax which predates the
// port of Miller from C to Go. Miller regexes use a final 'i' to indicate case-insensitivity; Go
// regexes use an initial "(?i)".
//
// (See also mlr.bnf where we specify which things can be backslash-escaped without a syntax error
// at parse time.)
//
// * If the regex_string is of the form a.*b, compiles it case-sensitively.
// * If the regex_string is of the form "a.*b", compiles a.*b case-sensitively.
// * If the regex_string is of the form "a.*b"i, compiles a.*b case-insensitively.
func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
n := len(regexString)
if n < 2 {
return regexpCompileCached(regexString)
}
// TODO: rethink this. This will strip out things people have entered, e.g. "\"...\"".
// The parser-to-AST will have stripped the outer and we'll strip the inner and the
// user's intent will be lost.
//
// TODO: make separate functions for calling from parser-to-AST (string
// literals) and from verbs (like cut -r or having-fields).
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"") {
return regexpCompileCached(regexString[1 : n-1])
}
if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/") {
return regexpCompileCached(regexString[1 : n-1])
}
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"i") {
return regexpCompileCached("(?i)" + regexString[1:n-2])
}
if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/i") {
return regexpCompileCached("(?i)" + regexString[1:n-2])
}
return regexpCompileCached(regexString)
}
// CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to
// return a second error argument rather than fataling. However, if there's a
// malformed regex we really cannot continue so it's simpler to just fatal.
func CompileMillerRegexOrDie(regexString string) *regexp.Regexp {
regex, err := CompileMillerRegex(regexString)
if err != nil {
fmt.Fprint(os.Stderr, err)
os.Exit(1)
}
return regex
}
// CompileMillerRegexesOrDie is a convenenience looper over CompileMillerRegexOrDie.
func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp {
regexes := make([]*regexp.Regexp, len(regexStrings))
for i, regexString := range regexStrings {
regexes[i] = CompileMillerRegexOrDie(regexString)
}
return regexes
}
// In Go as in all languages I'm aware of with a string-split, "a,b,c" splits
// on "," to ["a", "b", "c" and "a" splits to ["a"], both of which are fine --
// but "" splits to [""] when I wish it were []. This function does the latter.
func RegexCompiledSplitString(regex *regexp.Regexp, input string, n int) []string {
if input == "" {
return make([]string, 0)
} else {
return regex.Split(input, n)
}
}
// RegexStringSub implements the sub DSL function.
func RegexStringSub(
input string,
sregex string,
replacement string,
) string {
regex := CompileMillerRegexOrDie(sregex)
_, replacementCaptureMatrix := ReplacementHasCaptures(replacement)
return RegexCompiledSub(input, regex, replacement, replacementCaptureMatrix)
}
// RegexCompiledSub is the same as RegexStringSub but with compiled regex and
// replacement strings.
func RegexCompiledSub(
input string,
regex *regexp.Regexp,
replacement string,
replacementCaptureMatrix [][]int,
) string {
return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, true)
}
// RegexStringGsub implements the `gsub` DSL function.
func RegexStringGsub(
input string,
sregex string,
replacement string,
) string {
regex := CompileMillerRegexOrDie(sregex)
_, replacementCaptureMatrix := ReplacementHasCaptures(replacement)
return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, false)
}
// regexCompiledSubOrGsub is the implementation for `sub`/`gsub` with compilex regex
// and replacement strings.
func regexCompiledSubOrGsub(
input string,
regex *regexp.Regexp,
replacement string,
replacementCaptureMatrix [][]int,
breakOnFirst bool,
) string {
matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
if matrix == nil || len(matrix) == 0 {
return input
}
// Example return value from FindAllSubmatchIndex with input
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
//
// Matrix is [][]int{
// []int{3, 9, 3, 5, 6, 9},
// []int{12, 18, 12, 14, 15, 18},
// }
//
// * 3-9 is for the entire match "ab_cde"
// * 3-5 is for the first capture "ab"
// * 6-9 is for the second capture "cde"
//
// * 12-18 is for the entire match "fg_hij"
// * 12-14 is for the first capture "fg"
// * 15-18 is for the second capture "hij"
var buffer bytes.Buffer
nonMatchStartIndex := 0
for _, row := range matrix {
buffer.WriteString(input[nonMatchStartIndex:row[0]])
// "\0" .. "\9"
captures := make([]string, 10)
di := 0
n := len(row)
for si := 0; si < n && di <= 9; si += 2 {
start := row[si]
end := row[si+1]
if start >= 0 && end >= 0 {
captures[di] = input[start:end]
}
di += 1
}
// If the replacement had no captures, e.g. "xyz", we would insert it
//
// "..." -> "..."
// "ab_cde" -> "xyz" --- here
// "..." -> "..."
// "fg_hij" -> "xyz" --- and here
// "..." -> "..."
//
// using buffer.WriteString(replacement). However, this function exists
// to handle the case when the replacement string has captures like
// "\2:\1", so we need to produce
//
// "..." -> "..."
// "ab_cde" -> "cde:ab" --- here
// "..." -> "..."
// "fg_hij" -> "hij:fg" --- and here
// "..." -> "..."
updatedReplacement := InterpolateCaptures(
replacement,
replacementCaptureMatrix,
captures,
)
buffer.WriteString(updatedReplacement)
nonMatchStartIndex = row[1]
if breakOnFirst {
break
}
}
buffer.WriteString(input[nonMatchStartIndex:])
return buffer.String()
}
// RegexStringMatchSimple is for simple boolean return without any substring captures.
func RegexStringMatchSimple(
input string,
sregex string,
) bool {
regex := CompileMillerRegexOrDie(sregex)
return RegexCompiledMatchSimple(input, regex)
}
// RegexCompiledMatchSimple is for simple boolean return without any substring captures.
func RegexCompiledMatchSimple(
input string,
regex *regexp.Regexp,
) bool {
return regex.Match([]byte(input))
}
// RegexStringMatchWithMapResults implements much of the `strmatchx` DSL function. This returns
// captures via return values. This is distinct from RegexStringMatchWithCaptures which is for the
// `=~` DSL operator.
func RegexStringMatchWithMapResults(
input string,
sregex string,
) (
matches bool,
captures []string,
starts []int,
ends []int,
) {
regex := CompileMillerRegexOrDie(sregex)
return RegexCompiledMatchWithMapResults(input, regex)
}
// RegexCompiledMatchWithMapResults does the work for RegexStringMatchWithMapResults once
// a compiled regexp is available. Array slot 0 is for the full match; slots 1 and up
// are for the capture-matches such as "\([0-9]+\):\([a-z]+\)".
func RegexCompiledMatchWithMapResults(
input string,
regex *regexp.Regexp,
) (bool, []string, []int, []int) {
captures := make([]string, 0, 10)
starts := make([]int, 0, 10)
ends := make([]int, 0, 10)
matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
if matrix == nil || len(matrix) == 0 {
return false, captures, starts, ends
}
// If there are multiple matches -- e.g. input is
//
// "...ab_cde...fg_hij..."
//
// with regex
//
// "(..)_(...)"
//
// -- then we only consider the first match: boolean return value is true
// (the input string matched the regex), and the captures array will map
// slot 1 to "ab" and slot 2 to "cde".
row := matrix[0]
n := len(row)
// Example return value from FindAllSubmatchIndex with input
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
//
// Matrix is [][]int{
// []int{3, 9, 3, 5, 6, 9},
// []int{12, 18, 12, 14, 15, 18},
// }
//
// As noted above we look at only the first row.
//
// * 3-9 is for the entire match "ab_cde"
// * 3-5 is for the first capture "ab"
// * 6-9 is for the second capture "cde"
for si := 0; si < n; si += 2 {
start := row[si]
end := row[si+1]
if start >= 0 && end >= 0 {
captures = append(captures, input[start:end])
starts = append(starts, start+1)
ends = append(ends, end)
} else {
captures = append(captures, "")
starts = append(starts, -1)
ends = append(ends, -1)
}
}
return true, captures, starts, ends
}
// RegexStringMatchWithCaptures implements the =~ DSL operator. The captures are stored in DSL
// state and may be used by a DSL statement after the =~. For example, in
//
// sub($a, "(..)_(...)", "\1:\2")
//
// the replacement string is an argument to sub and therefore the captures are
// confined to the implementation of the sub function. Similarly for gsub. But
// for the match operator, people can do
//
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1"
// }
//
// and the =~ callsite doesn't know if captures will be used or not. So,
// RegexStringMatchWithCaptures always returns the captures array. It is stored within the CST
// state.
func RegexStringMatchWithCaptures(
input string,
sregex string,
) (
matches bool,
capturesOneUp []string,
) {
regex := CompileMillerRegexOrDie(sregex)
return RegexCompiledMatchWithCaptures(input, regex)
}
// RegexCompiledMatchWithCaptures is the implementation for the =~ operator. Without
// Miller-style regex captures this would a simple one-line
// regex.MatchString(input). However, we return the captures array for the
// benefit of subsequent references to "\0".."\9".
func RegexCompiledMatchWithCaptures(
input string,
regex *regexp.Regexp,
) (bool, []string) {
matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
if matrix == nil || len(matrix) == 0 {
// Set all captures to ""
return false, make([]string, 10)
}
// "\0" .. "\9"
captures := make([]string, 10)
// If there are multiple matches -- e.g. input is
//
// "...ab_cde...fg_hij..."
//
// with regex
//
// "(..)_(...)"
//
// -- then we only consider the first match: boolean return value is true
// (the input string matched the regex), and the captures array will map
// "\1" to "ab" and "\2" to "cde".
row := matrix[0]
n := len(row)
// Example return value from FindAllSubmatchIndex with input
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
//
// Matrix is [][]int{
// []int{3, 9, 3, 5, 6, 9},
// []int{12, 18, 12, 14, 15, 18},
// }
//
// As noted above we look at only the first row.
//
// * 3-9 is for the entire match "ab_cde"
// * 3-5 is for the first capture "ab"
// * 6-9 is for the second capture "cde"
di := 0
for si := 0; si < n && di <= 9; si += 2 {
start := row[si]
end := row[si+1]
if start >= 0 && end >= 0 {
captures[di] = input[start:end]
}
di += 1
}
return true, captures
}
// MakeEmptyCaptures is for initial CST state at the start of executing the DSL expression for the
// current record. Even if '$x =~ "(..)_(...)" set "\1" and "\2" on the previous record, at start
// of processing for the current record we need to start with a clean slate. This is in support of
// CST state, which `=~` semantics requires.
func MakeEmptyCaptures() []string {
return nil
}
// ReplacementHasCaptures is used by the CST builder to see if string-literal is like "foo bar" or
// "foo \1 bar" -- in the latter case it needs to retain the compiled offsets-matrix information.
// This is in support of CST state, which `=~` semantics requires.
func ReplacementHasCaptures(
replacement string,
) (
hasCaptures bool,
matrix [][]int,
) {
if captureDetector.MatchString(replacement) {
return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1)
} else {
return false, nil
}
}
// InterpolateCaptures example:
//
// * Input $x is "ab_cde"
//
// - DSL expression
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1";
// }
//
// * InterpolateCaptures is used on the evaluation of "\2:\1"
//
// * replacementString is "\2:\1"
//
// - replacementMatrix contains precomputed/cached offsets for the "\2" and
// "\1" substrings within "\2:\1"
//
// - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
// slot 2 being "cde" (for "\2"), and slots 3-9 being "".
func InterpolateCaptures(
replacementString string,
replacementMatrix [][]int,
captures []string,
) string {
if replacementMatrix == nil || captures == nil {
return replacementString
}
var buffer bytes.Buffer
nonMatchStartIndex := 0
for _, row := range replacementMatrix {
start := row[0]
buffer.WriteString(replacementString[nonMatchStartIndex:row[0]])
// Map "\0".."\9" to integer index 0..9
index := replacementString[start+1] - '0'
buffer.WriteString(captures[index])
nonMatchStartIndex = row[1]
}
buffer.WriteString(replacementString[nonMatchStartIndex:])
return buffer.String()
}