Skip to content

Commit

Permalink
Optimize search for start-anchored regexes (#683)
Browse files Browse the repository at this point in the history
When a regex is anchored to the start of a subject, there's no need
to search throughout a string for the pattern when searching for the
first match: a prefix match is sufficient.

This adds a check, performed when a regex is compiled, that determines
whether a match can only be found at the start of a subject. `firstMatch`
then uses the result to decide whether it can defer to `prefixMatch`.
  • Loading branch information
natecook1000 authored Jul 20, 2023
1 parent 2aababb commit cc96bb5
Show file tree
Hide file tree
Showing 8 changed files with 209 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Sources/RegexBenchmark/Suite/NotFound.swift
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ extension BenchmarkRunner {
baseName: "AnchoredNotFound",
regex: "^ +a",
input: input,
isWhole: true)
includeFirst: true)
anchoredNotFound.register(&self)
}
}
1 change: 1 addition & 0 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ extension Compiler.ByteCodeGen {
// The whole match (`.0` element of output) is equivalent to an implicit
// capture over the entire regex.
try emitNode(.capture(name: nil, reference: nil, root))
builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart()
builder.buildAccept()
return try builder.assemble()
}
Expand Down
6 changes: 5 additions & 1 deletion Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ extension MEProgram {
var captureList = CaptureList()
var initialOptions = MatchingOptions()

// Starting constraint
var canOnlyMatchAtStart = false

// Symbolic reference resolution
var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:]
var referencedCaptureOffsets: [ReferenceID: Int] = [:]
Expand Down Expand Up @@ -404,7 +407,8 @@ extension MEProgram.Builder {
enableMetrics: enableMetrics,
captureList: captureList,
referencedCaptureOffsets: referencedCaptureOffsets,
initialOptions: initialOptions)
initialOptions: initialOptions,
canOnlyMatchAtStart: canOnlyMatchAtStart)
}

/// Discards everything accumulated so far by replacing this value with a
/// freshly initialized one.
mutating func reset() {
  self = .init()
}
Expand Down
1 change: 1 addition & 0 deletions Sources/_StringProcessing/Engine/MEProgram.swift
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct MEProgram {
let referencedCaptureOffsets: [ReferenceID: Int]

var initialOptions: MatchingOptions
var canOnlyMatchAtStart: Bool
}

extension MEProgram: CustomStringConvertible {
Expand Down
110 changes: 110 additions & 0 deletions Sources/_StringProcessing/Regex/DSLTree.swift
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,105 @@ extension DSLTree.Node {
}
}

extension DSLTree.Node {
  /// Implementation for `canOnlyMatchAtStart`, which maintains the option
  /// state.
  ///
  /// For a given specific node, this method can return one of three values:
  ///
  /// - `true`: This node is guaranteed to match only at the start of a subject.
  /// - `false`: This node can match anywhere in the subject.
  /// - `nil`: This node is inconclusive about where it can match.
  ///
  /// In particular, non-required groups, option-setting atoms, and negated
  /// lookarounds are inconclusive about where they can match.
  ///
  /// Returning `false` is always the conservative answer: it only disables
  /// the prefix-match shortcut. Returning `true` incorrectly would make
  /// `firstMatch` miss valid matches, so every `true` below must be provable.
  ///
  /// - Parameter options: The matching options in effect when this node is
  ///   reached. Option-changing nodes update it in place so that later
  ///   siblings see the change.
  private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? {
    switch self {
    // Defining cases
    case .atom(.assertion(.startOfSubject)):
      return true
    case .atom(.assertion(.caretAnchor)):
      // `^` is only a start-of-subject anchor while anchors don't match at
      // newlines; otherwise it can match at the start of any line.
      return !options.anchorsMatchNewlines

    // Changing options doesn't determine `true`/`false`.
    case .atom(.changeMatchingOptions(let sequence)):
      options.apply(sequence.ast)
      return nil

    // Any other atom or consuming node returns `false`.
    case .atom, .customCharacterClass, .quotedLiteral:
      return false

    // Trivia/empty have no effect.
    case .trivia, .empty:
      return nil

    // In an alternation, all of its children must match only at start.
    // An alternation without children proves nothing, so the vacuous truth
    // of `allSatisfy` on an empty collection is not accepted as evidence.
    case .orderedChoice(let children):
      return !children.isEmpty
        && children.allSatisfy { $0._canOnlyMatchAtStartImpl(&options) == true }

    // In a concatenation, the first definitive child provides the answer.
    case .concatenation(let children):
      for child in children {
        if let result = child._canOnlyMatchAtStartImpl(&options) {
          return result
        }
      }
      return false

    // Groups (and other parent nodes) defer to the child.
    case .nonCapturingGroup(let kind, let child):
      switch kind.ast {
      case .negativeLookahead, .negativeLookbehind:
        // A negated lookaround succeeds exactly where its child fails, so a
        // start-anchored child (e.g. `/(?!^)foo/`) must not be read as a
        // start-only constraint on the whole regex. The group consumes
        // nothing, so it is inconclusive and later siblings decide.
        return nil
      default:
        break
      }
      options.beginScope()
      defer { options.endScope() }
      if case .changeMatchingOptions(let sequence) = kind.ast {
        options.apply(sequence)
      }
      return child._canOnlyMatchAtStartImpl(&options)
    case .capture(_, _, let child, _):
      options.beginScope()
      defer { options.endScope() }
      return child._canOnlyMatchAtStartImpl(&options)
    case .ignoreCapturesInTypedOutput(let child),
         .convertedRegexLiteral(let child, _):
      return child._canOnlyMatchAtStartImpl(&options)

    // A quantification that doesn't require its child to exist can still
    // allow a start-only match. (e.g. `/(foo)?^bar/`)
    case .quantification(let amount, _, let child):
      return amount.requiresAtLeastOne
        ? child._canOnlyMatchAtStartImpl(&options)
        : nil

    // For conditional nodes, both sides must require matching at start.
    case .conditional(_, let child1, let child2):
      return child1._canOnlyMatchAtStartImpl(&options) == true
        && child2._canOnlyMatchAtStartImpl(&options) == true

    // Extended behavior isn't known, so we return `false` for safety.
    case .consumer, .matcher, .characterPredicate, .absentFunction:
      return false
    }
  }

  /// Returns a Boolean value indicating whether the regex with this node as
  /// the root can _only_ match at the start of a subject.
  ///
  /// For example, these regexes can only match at the start of a subject:
  ///
  /// - `/^foo/`
  /// - `/(^foo|^bar)/` (both sides of the alternation start with `^`)
  ///
  /// These can match other places in a subject:
  ///
  /// - `/(^foo)?bar/` (`^` is in an optional group)
  /// - `/(^foo|bar)/` (only one side of the alternation starts with `^`)
  /// - `/(?m)^foo/` (`^` means "the start of a line" due to `(?m)`)
  /// - `/(?!^)foo/` (the negative lookahead rejects the start of the subject)
  ///
  /// An inconclusive analysis is reported as `false`.
  internal func canOnlyMatchAtStart() -> Bool {
    var options = MatchingOptions()
    return _canOnlyMatchAtStartImpl(&options) ?? false
  }
}

// MARK: AST wrapper types
//
// These wrapper types are required because even @_spi-marked public APIs can't
Expand Down Expand Up @@ -818,6 +917,17 @@ extension DSLTree {
public static func range(_ lower: Int, _ upper: Int) -> Self {
.init(ast: .range(.init(lower, at: .fake), .init(upper, at: .fake)))
}

/// Whether this quantification amount forces its operand to match at least
/// once.
///
/// Amounts with an explicit lower bound count as required only when that
/// bound has a known value greater than zero.
internal var requiresAtLeastOne: Bool {
  switch ast {
  case .oneOrMore:
    return true
  case .exactly(let lowerBound), .nOrMore(let lowerBound), .range(let lowerBound, _):
    // A bound without a value is treated as "not required".
    if let minimum = lowerBound.value {
      return minimum > 0
    }
    return false
  case .zeroOrOne, .zeroOrMore, .upToN:
    return false
  }
}
}

@_spi(RegexBuilder)
Expand Down
4 changes: 3 additions & 1 deletion Sources/_StringProcessing/Regex/Match.swift
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,9 @@ extension Regex {
_ input: String,
in subjectBounds: Range<String.Index>
) throws -> Regex<Output>.Match? {
try _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds)
try regex.program.loweredProgram.canOnlyMatchAtStart
? _match(input, in: subjectBounds, mode: .partialFromFront)
: _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds)
}

func _firstMatch(
Expand Down
51 changes: 50 additions & 1 deletion Tests/RegexBuilderTests/RegexDSLTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
//===----------------------------------------------------------------------===//

import XCTest
import _StringProcessing
@testable import _StringProcessing
import RegexBuilder
import TestSupport

Expand Down Expand Up @@ -973,6 +973,55 @@ class RegexDSLTests: XCTestCase {
}
}

func testCanOnlyMatchAtStart() throws {
  /// Builds a regex from `content` and checks the `canOnlyMatchAtStart`
  /// flag of its lowered program against `expectation`, reporting failures
  /// at the caller's location.
  func expectCanOnlyMatchAtStart(
    _ expectation: Bool,
    file: StaticString = #file, line: UInt = #line,
    @RegexComponentBuilder _ content: () -> some RegexComponent
  ) {
    let built = content().regex
    let program = built.program.loweredProgram
    XCTAssertEqual(
      program.canOnlyMatchAtStart, expectation,
      file: file, line: line)
  }

  // A leading start-of-subject anchor.
  expectCanOnlyMatchAtStart(true) {
    Anchor.startOfSubject
    "foo"
  }

  // No anchor at all.
  expectCanOnlyMatchAtStart(false) {
    "foo"
  }

  // An optional component ahead of the anchor.
  expectCanOnlyMatchAtStart(true) {
    Optionally { "foo" }
    Anchor.startOfSubject
    "bar"
  }

  // Every alternative starts with a start-of-subject anchor.
  expectCanOnlyMatchAtStart(true) {
    ChoiceOf {
      Regex {
        Anchor.startOfSubject
        "foo"
      }
      Regex {
        Anchor.startOfSubject
        "bar"
      }
    }
  }

  // One alternative uses a start-of-line anchor instead.
  expectCanOnlyMatchAtStart(false) {
    ChoiceOf {
      Regex {
        Anchor.startOfSubject
        "foo"
      }
      Regex {
        Anchor.startOfLine
        "bar"
      }
    }
  }
}

func testNestedGroups() throws {
return;

Expand Down
38 changes: 38 additions & 0 deletions Tests/RegexTests/CompileTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -484,4 +484,42 @@ extension RegexTests {
expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
}

func testCanOnlyMatchAtStart() throws {
  // Each row pairs a pattern with the expected `canOnlyMatchAtStart` flag of
  // its lowered program. `line` records where the row was written so that a
  // failure is reported against the row itself.
  let expectations: [(pattern: String, startOnly: Bool, line: UInt)] = [
    ("^foo", true, #line),            // anchor
    ("\\Afoo", true, #line),          // more specific anchor
    ("foo", false, #line),            // no anchor

    ("(?i)^foo", true, #line),        // unrelated option
    ("(?m)^foo", false, #line),       // anchors match newlines
    ("(?i:^foo)", true, #line),       // unrelated option
    ("(?m:^foo)", false, #line),      // anchors match newlines

    ("(^foo|bar)", false, #line),     // one side of alternation
    ("(foo|^bar)", false, #line),     // other side of alternation
    ("(^foo|^bar)", true, #line),     // both sides of alternation

    // Quantifiers that include the anchor
    ("(^foo)?bar", false, #line),
    ("(^foo)*bar", false, #line),
    ("(^foo)+bar", true, #line),
    ("(?:^foo)+bar", true, #line),

    // Quantifiers before the anchor
    ("(foo)?^bar", true, #line),      // The initial group must match ""
    ("(?:foo)?^bar", true, #line),
    ("(foo)+^bar", false, #line),     // This can't actually match anywhere
  ]

  for (pattern, startOnly, line) in expectations {
    let regex = try Regex(pattern)
    XCTAssertEqual(
      regex.program.loweredProgram.canOnlyMatchAtStart, startOnly,
      file: #file, line: line)
  }
}
}

0 comments on commit cc96bb5

Please sign in to comment.