Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor epsilon transitions #129

Merged
merged 9 commits into from
Sep 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions src/regex.nim
Original file line number Diff line number Diff line change
Expand Up @@ -1143,6 +1143,8 @@ proc toString(
result = "["
result.add($n)
for nn in n.next:
if isEpsilonTransition(pattern.nfa.s[nn]):
continue
result.add(", ")
result.add(pattern.toString(nn, visited))
result.add("]")
Expand Down Expand Up @@ -1368,10 +1370,10 @@ when isMainModule:

doAssert graph(Regex(re2"^a+$")) == """digraph graphname {
0 [label="q0";color=blue];
1 [label="q1";color=black];
2 [label="q2";color=blue];
0 -> 1 [label="a, {^}, i=0"];
1 -> 1 [label="a, i=0"];1 -> 2 [label="{eoe}, {$}, i=1"];
2 [label="q1";color=black];
4 [label="q2";color=blue];
0 -> 2 [label="a, {^}, i=0"];
2 -> 2 [label="a, i=0"];2 -> 4 [label="{eoe}, {$}, i=1"];
}
"""

Expand Down
27 changes: 19 additions & 8 deletions src/regex/dotgraph.nim
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,34 @@ func color(n: Node): string =
func graph*(nfa: Nfa): string =
result = "digraph graphname {\n"
let tab = " "
var qi = 0
for i, n in pairs nfa.s:
if isEpsilonTransition(n):
continue
result.add tab
result.add($i & " [label=\"q" & $i & "\";color=" & n.color & "];")
result.add($i & " [label=\"q" & $qi & "\";color=" & n.color & "];")
result.add '\n'
inc qi
for i, n in pairs nfa.s:
if n.next.len == 0:
continue
if isEpsilonTransition(n):
continue
result.add tab
for i2, n2 in pairs n.next:
var t = ""
if nfa.t.allZ[i][i2] > -1:
for i3, z in pairs nfa.t.z[nfa.t.allZ[i][i2]]:
if i3 > 0: t &= ", "
t &= $z
var t = ""
var ii = 0
for n2 in n.next:
if isEpsilonTransition(nfa.s[n2]):
if t.len > 0:
t &= ", "
t &= $nfa.s[n2]
continue
if t.len > 0:
t = ", {" & t & "}"
let label = ($nfa.s[n2] & t & ", i=" & $i2).replace(r"\", r"\\")
let label = ($nfa.s[n2] & t & ", i=" & $ii).replace(r"\", r"\\")
result.add($i & " -> " & $n2 & " [label=\"" & label & "\"];")
t = ""
inc ii
result.add '\n'
result.add "}\n"

Expand Down
2 changes: 2 additions & 0 deletions src/regex/litopt.nim
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,8 @@ when isMainModule:
result = "["
result.add $n.cp
for nn in n.next:
if isEpsilonTransition(nfa.s[nn]):
continue
result.add ", "
result.add toString(nfa, nn, visited)
result.add "]"
Expand Down
44 changes: 19 additions & 25 deletions src/regex/nfa.nim
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,10 @@ func eNfa*(exp: RpnExp): Enfa {.raises: [RegexError].} =
result.s.add initSkipNode(states)

type
Zclosure = seq[int16]
TeClosure = seq[(int16, Zclosure)]
Etransitions = seq[int16] # xxx transitions
TeClosure = seq[(int16, Etransitions)]

func isTransitionZ(n: Node): bool {.inline.} =
func isEpsilonTransition2(n: Node): bool {.inline.} =
result = case n.kind
of groupKind:
n.isCapturing
Expand All @@ -163,24 +163,24 @@ func teClosure(
eNfa: Enfa,
state: int16,
processing: var seq[int16],
zTransitions: Zclosure
eTransitions: Etransitions
) =
var zTransitionsCurr = zTransitions
if isTransitionZ eNfa.s[state]:
zTransitionsCurr.add state
var eTransitionsCurr = eTransitions
if isEpsilonTransition2 eNfa.s[state]:
eTransitionsCurr.add state
if eNfa.s[state].kind in matchableKind + {reEOE}:
result.add (state, zTransitionsCurr)
result.add (state, eTransitionsCurr)
return
for i, s in pairs eNfa.s[state].next:
# Enter loops only once. "a", re"(a*)*" -> ["a", ""]
if eNfa.s[state].kind in repetitionKind:
if s notin processing or i == int(eNfa.s[state].isGreedy):
processing.add s
teClosure(result, eNfa, s, processing, zTransitionsCurr)
teClosure(result, eNfa, s, processing, eTransitionsCurr)
discard processing.pop()
# else skip loop
else:
teClosure(result, eNfa, s, processing, zTransitionsCurr)
teClosure(result, eNfa, s, processing, eTransitionsCurr)

func teClosure(
result: var TeClosure,
Expand All @@ -189,9 +189,9 @@ func teClosure(
processing: var seq[int16]
) =
doAssert processing.len == 0
var zclosure: Zclosure
var eTransitions: Etransitions
for s in eNfa.s[state].next:
teClosure(result, eNfa, s, processing, zclosure)
teClosure(result, eNfa, s, processing, eTransitions)

when (NimMajor, NimMinor, NimPatch) < (1,4,0) and not declared(IndexDefect):
# avoids a warning
Expand All @@ -206,15 +206,13 @@ func eRemoval*(eNfa: Enfa): Nfa {.raises: [].} =
#echo eNfa
result.s = newSeq[Node](eNfa.s.len)
result.s.setLen 0
result.t.allZ.setLen eNfa.s.len
var statesMap = newSeq[int16](eNfa.s.len)
for i in 0 .. statesMap.len-1:
statesMap[i] = -1
let start = int16(eNfa.s.len-1)
result.s.add eNfa.s[start]
statesMap[start] = 0'i16
var closure: TeClosure
var zc: seq[Node]
var qw = initDeque[int16](2)
qw.addFirst start
var qu: set[int16]
Expand All @@ -228,25 +226,21 @@ func eRemoval*(eNfa: Enfa): Nfa {.raises: [].} =
doAssert false
closure.setLen 0
teClosure(closure, eNfa, qa, processing)
doAssert statesMap[qa] > -1
result.s[statesMap[qa]].next.setLen 0
for qb, zclosure in closure.items:
for qb, eTransitions in closure.items:
for eti in eTransitions:
if statesMap[eti] == -1:
result.s.add eNfa.s[eti]
statesMap[eti] = result.s.len.int16-1
result.s[statesMap[qa]].next.add statesMap[eti]
if statesMap[qb] == -1:
result.s.add eNfa.s[qb]
statesMap[qb] = result.s.len.int16-1
doAssert statesMap[qb] > -1
doAssert statesMap[qa] > -1
result.s[statesMap[qa]].next.add statesMap[qb]
result.t.allZ[statesMap[qa]].add -1'i16
zc.setLen 0
for z in zclosure:
zc.add eNfa.s[z]
if zc.len > 0:
result.t.z.add zc
result.t.allZ[statesMap[qa]][^1] = int16(result.t.z.len-1)
if qb notin qu:
qu.incl qb
qw.addFirst qb
result.t.allZ.setLen result.s.len

func reverse(eNfa: Enfa): Enfa =
template state0: untyped = int16(eNfa.s.len-1)
Expand Down
32 changes: 16 additions & 16 deletions src/regex/nfafindall.nim
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ func submatch(
i: int,
cPrev, c: int32
) {.inline.} =
template tns: untyped = regex.nfa.t
template nfa: untyped = regex.nfa.s
template smA: untyped = ms.a
template smB: untyped = ms.b
Expand All @@ -118,39 +117,39 @@ func submatch(
template capt: untyped = ms.a[smi].ci
template bounds: untyped = ms.a[smi].bounds
template look: untyped = ms.look
template nt: untyped = nfa[n].next[nti]
template ntn: untyped = nfa[nt]
smB.clear()
var captx: int32
var matched = true
var eoeFound = false
var smi = 0
while smi < smA.len:
for nti, nt in nfa[n].next.pairs:
if smB.hasState nt:
continue
if nfa[nt].kind != reEoe and not match(nfa[nt], c.Rune):
continue
var nti = 0
while nti <= nfa[n].next.len-1:
matched = true
captx = capt
if tns.allZ[n][nti] > -1:
for z in tns.z[tns.allZ[n][nti]]:
if not matched:
break
case z.kind
while isEpsilonTransition(ntn):
if matched:
case ntn.kind
of groupKind:
capts.add CaptNode(
parent: captx,
bound: i,
idx: z.idx)
idx: ntn.idx)
captx = (capts.len-1).int32
of assertionKind - lookaroundKind:
matched = match(z, cPrev.Rune, c.Rune)
matched = match(ntn, cPrev.Rune, c.Rune)
of lookaroundKind:
lookAroundTpl()
else:
assert false
doAssert false
discard
if matched:
if nfa[nt].kind == reEoe:
inc nti
if matched and
not smB.hasState(nt) and
(ntn.match(c.Rune) or ntn.kind == reEoe):
if ntn.kind == reEoe:
#debugEcho "eoe ", bounds, " ", ms.m
ms.m.add (captx, bounds.a .. i-1)
smA.clear()
Expand All @@ -160,6 +159,7 @@ func submatch(
smi = -1
break
smB.add (nt, captx, bounds.a .. i-1)
inc nti
inc smi
swap smA, smB

Expand Down
34 changes: 17 additions & 17 deletions src/regex/nfafindall2.nim
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ func submatch(
i: int,
cPrev, c: int32
) {.inline.} =
template tns: untyped = regex.nfa.t
template nfa: untyped = regex.nfa.s
template smA: untyped = ms.a
template smB: untyped = ms.b
Expand All @@ -155,6 +154,8 @@ func submatch(
template capt: untyped = ms.a[smi].ci
template bounds: untyped = ms.a[smi].bounds
template look: untyped = ms.look
template nt: untyped = nfa[n].next[nti]
template ntn: untyped = nfa[nt]
smB.clear()
var captx: int32
var matched = true
Expand All @@ -163,37 +164,35 @@ func submatch(
while smi < smA.len:
if capt != -1:
capts.keepAlive capt
for nti, nt in nfa[n].next.pairs:
if smB.hasState nt:
continue
if nfa[nt].kind != reEoe and not match(nfa[nt], c.Rune):
continue
var nti = 0
while nti <= nfa[n].next.len-1:
matched = true
captx = capt
if tns.allZ[n][nti] > -1:
for z in tns.z[tns.allZ[n][nti]]:
if not matched:
break
case z.kind
while isEpsilonTransition(ntn):
if matched:
case ntn.kind
of reGroupStart:
captx = capts.diverge captx
capts[captx, z.idx].a = i
capts[captx, ntn.idx].a = i
of reGroupEnd:
captx = capts.diverge captx
capts[captx, z.idx].b = i-1
capts[captx, ntn.idx].b = i-1
of assertionKind - lookaroundKind:
matched = match(z, cPrev.Rune, c.Rune)
matched = match(ntn, cPrev.Rune, c.Rune)
of lookaroundKind:
let freezed = capts.freeze()
lookAroundTpl()
capts.unfreeze freezed
if captx != -1:
capts.keepAlive captx
else:
assert false
doAssert false
discard
if matched:
if nfa[nt].kind == reEoe:
inc nti
if matched and
not smB.hasState(nt) and
(ntn.match(c.Rune) or ntn.kind == reEoe):
if ntn.kind == reEoe:
#debugEcho "eoe ", bounds, " ", ms.m
ms.add (captx, bounds.a .. i-1)
smA.clear()
Expand All @@ -203,6 +202,7 @@ func submatch(
smi = -1
break
smB.add (nt, captx, bounds.a .. i-1)
inc nti
inc smi
swap smA, smB
capts.recycle()
Expand Down
20 changes: 17 additions & 3 deletions src/regex/nfamacro.nim
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import std/macros
import std/unicode
import std/tables
import std/sets
import std/algorithm

import pkg/unicodedb/properties
import pkg/unicodedb/types as utypes
Expand Down Expand Up @@ -268,6 +269,13 @@ func genLookaroundMatch(
`lookaroundStmt`
removeLast `smL`

func getEpsilonTransitions(nfa: Nfa, n: Node, nti: int): seq[int] =
  ## Returns the entries of `n.next` that sit directly before index `nti`
  ## and whose states satisfy `isEpsilonTransition`, kept in the same
  ## order they have in `n.next`. The result is empty when `nti` is 0 or
  ## when the entry just before `nti` is not an epsilon transition.
  # Walk backwards to find where the contiguous epsilon run begins.
  var first = nti
  while first > 0 and isEpsilonTransition(nfa.s[n.next[first-1]]):
    dec first
  # Emit the run front-to-back so no reversal is needed afterwards.
  for pos in first .. nti-1:
    result.add n.next[pos]

func genMatchedBody(
smB, ntLit, capt, bounds, matched, captx,
capts, charIdx, cPrev, c, text: NimNode,
Expand All @@ -276,19 +284,21 @@ func genMatchedBody(
look: Lookaround,
flags: set[MatchFlag]
): NimNode =
template t: untyped = nfa.t
template n: untyped = nfa.s[i]
template z: untyped = nfa.s[eti]
let bounds2 = if mfBwMatch in flags:
quote do: `charIdx` .. `bounds`.b
else:
quote do: `bounds`.a .. `charIdx`-1
if t.allZ[i][nti] == -1'i16:
let eTransitions = getEpsilonTransitions(nfa, n, nti)
if eTransitions.len == 0:
return quote do:
add(`smB`, (`ntLit`, `capt`, `bounds2`))
var matchedBody: seq[NimNode]
matchedBody.add quote do:
`matched` = true
`captx` = `capt`
for z in t.z[t.allZ[i][nti]]:
for eti in eTransitions:
case z.kind
of groupKind:
let zIdx = newLit z.idx
Expand Down Expand Up @@ -347,10 +357,14 @@ func genNextState(
for i in 0 .. s.len-1:
if s[i].kind == reEoe:
continue
if isEpsilonTransition(s[i]):
continue
var branchBodyN: seq[NimNode]
for nti, nt in s[i].next.pairs:
if eoeOnly and s[nt].kind != reEoe:
continue
if isEpsilonTransition(s[nt]):
continue
let matchCond = case s[nt].kind
of reEoe:
quote do: `c` == -1'i32
Expand Down
Loading
Loading