From cc96bb5ca24e97ef7d07dd5d7a2a4a8b62d5406a Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 20 Jul 2023 09:59:13 -0500 Subject: [PATCH] Optimize search for start-anchored regexes (#683) When a regex is anchored to the start of a subject, there's no need to search throughout a string for the pattern when searching for the first match: a prefix match is sufficient. This adds a regex compilation-time check about whether a match can only be found at the start of a subject, and then uses that to choose whether to defer to `prefixMatch` from within `firstMatch`. --- Sources/RegexBenchmark/Suite/NotFound.swift | 2 +- Sources/_StringProcessing/ByteCodeGen.swift | 1 + .../_StringProcessing/Engine/MEBuilder.swift | 6 +- .../_StringProcessing/Engine/MEProgram.swift | 1 + Sources/_StringProcessing/Regex/DSLTree.swift | 110 ++++++++++++++++++ Sources/_StringProcessing/Regex/Match.swift | 4 +- Tests/RegexBuilderTests/RegexDSLTests.swift | 51 +++++++- Tests/RegexTests/CompileTests.swift | 38 ++++++ 8 files changed, 209 insertions(+), 4 deletions(-) diff --git a/Sources/RegexBenchmark/Suite/NotFound.swift b/Sources/RegexBenchmark/Suite/NotFound.swift index a1ed7eae0..be2e67e79 100644 --- a/Sources/RegexBenchmark/Suite/NotFound.swift +++ b/Sources/RegexBenchmark/Suite/NotFound.swift @@ -13,7 +13,7 @@ extension BenchmarkRunner { baseName: "AnchoredNotFound", regex: "^ +a", input: input, - isWhole: true) + includeFirst: true) anchoredNotFound.register(&self) } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 00ce0d5f6..cb2e9ed04 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -46,6 +46,7 @@ extension Compiler.ByteCodeGen { // The whole match (`.0` element of output) is equivalent to an implicit // capture over the entire regex. 
try emitNode(.capture(name: nil, reference: nil, root))
+    builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart()
     builder.buildAccept()
     return try builder.assemble()
   }
diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift
index 93801aeec..e26a00fb1 100644
--- a/Sources/_StringProcessing/Engine/MEBuilder.swift
+++ b/Sources/_StringProcessing/Engine/MEBuilder.swift
@@ -43,6 +43,9 @@ extension MEProgram {
     var captureList = CaptureList()
     var initialOptions = MatchingOptions()
 
+    // Starting constraint
+    var canOnlyMatchAtStart = false
+
     // Symbolic reference resolution
     var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:]
     var referencedCaptureOffsets: [ReferenceID: Int] = [:]
@@ -404,7 +407,8 @@ extension MEProgram.Builder {
       enableMetrics: enableMetrics,
       captureList: captureList,
       referencedCaptureOffsets: referencedCaptureOffsets,
-      initialOptions: initialOptions)
+      initialOptions: initialOptions,
+      canOnlyMatchAtStart: canOnlyMatchAtStart)
   }
 
   mutating func reset() { self = Self() }
diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift
index 67f5a8bc9..3107d5ef7 100644
--- a/Sources/_StringProcessing/Engine/MEProgram.swift
+++ b/Sources/_StringProcessing/Engine/MEProgram.swift
@@ -38,6 +38,7 @@ struct MEProgram {
   let referencedCaptureOffsets: [ReferenceID: Int]
 
   var initialOptions: MatchingOptions
+  var canOnlyMatchAtStart: Bool
 }
 
 extension MEProgram: CustomStringConvertible {
diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift
index b784e2382..f24b87d09 100644
--- a/Sources/_StringProcessing/Regex/DSLTree.swift
+++ b/Sources/_StringProcessing/Regex/DSLTree.swift
@@ -711,6 +711,113 @@ extension DSLTree.Node {
   }
 }
 
+extension DSLTree.Node {
+  /// Implementation for `canOnlyMatchAtStart`, which maintains the option
+  /// state.
+  ///
+  /// For a given specific node, this method can return one of three values:
+  ///
+  /// - `true`: This node is guaranteed to match only at the start of a subject.
+  /// - `false`: This node can match anywhere in the subject.
+  /// - `nil`: This node is inconclusive about where it can match.
+  ///
+  /// In particular, non-required groups, negative lookaheads, and
+  /// option-setting groups are inconclusive about where they can match.
+  private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? {
+    switch self {
+    // Defining cases
+    case .atom(.assertion(.startOfSubject)):
+      return true
+    case .atom(.assertion(.caretAnchor)):
+      return !options.anchorsMatchNewlines
+
+    // Changing options doesn't determine `true`/`false`.
+    case .atom(.changeMatchingOptions(let sequence)):
+      options.apply(sequence.ast)
+      return nil
+
+    // Any other atom or consuming node returns `false`.
+    case .atom, .customCharacterClass, .quotedLiteral:
+      return false
+
+    // Trivia/empty have no effect.
+    case .trivia, .empty:
+      return nil
+
+    // In an alternation, all of its children must match only at start.
+    case .orderedChoice(let children):
+      return children.allSatisfy { $0._canOnlyMatchAtStartImpl(&options) == true }
+
+    // In a concatenation, the first definitive child provides the answer.
+    case .concatenation(let children):
+      for child in children {
+        if let result = child._canOnlyMatchAtStartImpl(&options) {
+          return result
+        }
+      }
+      return false
+
+    // Groups (and other parent nodes) defer to the child.
+    case .nonCapturingGroup(let kind, let child):
+      // A negative lookahead asserts that its child does *not* match, so an
+      // anchor inside it (e.g. `/(?!^)foo/`) can't pin the match to the start.
+      // The group is also zero-width, so stay inconclusive and let the next
+      // sibling decide (e.g. `/(?!foo)^bar/` is still start-anchored).
+      if case .negativeLookahead = kind.ast {
+        return nil
+      }
+      options.beginScope()
+      defer { options.endScope() }
+      if case .changeMatchingOptions(let sequence) = kind.ast {
+        options.apply(sequence)
+      }
+      return child._canOnlyMatchAtStartImpl(&options)
+    case .capture(_, _, let child, _):
+      options.beginScope()
+      defer { options.endScope() }
+      return child._canOnlyMatchAtStartImpl(&options)
+    case .ignoreCapturesInTypedOutput(let child),
+        .convertedRegexLiteral(let child, _):
+      return child._canOnlyMatchAtStartImpl(&options)
+
+    // A quantification that doesn't require its child to exist can still
+    // allow a start-only match. (e.g. `/(foo)?^bar/`)
+    case .quantification(let amount, _, let child):
+      return amount.requiresAtLeastOne
+        ? child._canOnlyMatchAtStartImpl(&options)
+        : nil
+
+    // For conditional nodes, both sides must require matching at start.
+    case .conditional(_, let child1, let child2):
+      return child1._canOnlyMatchAtStartImpl(&options) == true
+        && child2._canOnlyMatchAtStartImpl(&options) == true
+
+    // Extended behavior isn't known, so we return `false` for safety.
+    case .consumer, .matcher, .characterPredicate, .absentFunction:
+      return false
+    }
+  }
+
+  /// Returns a Boolean value indicating whether the regex with this node as
+  /// the root can _only_ match at the start of a subject.
+  ///
+  /// For example, these regexes can only match at the start of a subject:
+  ///
+  /// - `/^foo/`
+  /// - `/(^foo|^bar)/` (both sides of the alternation start with `^`)
+  ///
+  /// These can match other places in a subject:
+  ///
+  /// - `/(^foo)?bar/` (`^` is in an optional group)
+  /// - `/(?!^)foo/` (`^` is inside a negative lookahead)
+  /// - `/(^foo|bar)/` (only one side of the alternation starts with `^`)
+  /// - `/(?m)^foo/` (`^` means "the start of a line" due to `(?m)`)
+  internal func canOnlyMatchAtStart() -> Bool {
+    var options = MatchingOptions()
+    return _canOnlyMatchAtStartImpl(&options) ?? false
+  }
+}
+
 // MARK: AST wrapper types
 //
 // These wrapper types are required because even @_spi-marked public APIs can't
@@ -818,6 +925,17 @@ extension DSLTree {
       public static func range(_ lower: Int, _ upper: Int) -> Self {
         .init(ast: .range(.init(lower, at: .fake), .init(upper, at: .fake)))
       }
+
+      internal var requiresAtLeastOne: Bool {
+        switch ast {
+        case .zeroOrOne, .zeroOrMore, .upToN:
+          return false
+        case .oneOrMore:
+          return true
+        case .exactly(let num), .nOrMore(let num), .range(let num, _):
+          return num.value.map { $0 > 0 } ?? false
+        }
+      }
     }
 
     @_spi(RegexBuilder)
diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift
index e5e899ced..f13b01a85 100644
--- a/Sources/_StringProcessing/Regex/Match.swift
+++ b/Sources/_StringProcessing/Regex/Match.swift
@@ -273,7 +273,9 @@ extension Regex {
     _ input: String,
     in subjectBounds: Range<String.Index>
   ) throws -> Regex<Output>.Match? {
-    try _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds)
+    try regex.program.loweredProgram.canOnlyMatchAtStart
+      ?
_match(input, in: subjectBounds, mode: .partialFromFront) + : _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds) } func _firstMatch( diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 19ac675dc..06b6ff1a3 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -10,7 +10,7 @@ //===----------------------------------------------------------------------===// import XCTest -import _StringProcessing +@testable import _StringProcessing import RegexBuilder import TestSupport @@ -973,6 +973,55 @@ class RegexDSLTests: XCTestCase { } } + func testCanOnlyMatchAtStart() throws { + func expectCanOnlyMatchAtStart( + _ expectation: Bool, + file: StaticString = #file, line: UInt = #line, + @RegexComponentBuilder _ content: () -> some RegexComponent + ) { + let regex = content().regex + XCTAssertEqual(regex.program.loweredProgram.canOnlyMatchAtStart, expectation, file: file, line: line) + } + + expectCanOnlyMatchAtStart(true) { + Anchor.startOfSubject + "foo" + } + expectCanOnlyMatchAtStart(false) { + "foo" + } + expectCanOnlyMatchAtStart(true) { + Optionally { "foo" } + Anchor.startOfSubject + "bar" + } + + expectCanOnlyMatchAtStart(true) { + ChoiceOf { + Regex { + Anchor.startOfSubject + "foo" + } + Regex { + Anchor.startOfSubject + "bar" + } + } + } + expectCanOnlyMatchAtStart(false) { + ChoiceOf { + Regex { + Anchor.startOfSubject + "foo" + } + Regex { + Anchor.startOfLine + "bar" + } + } + } + } + func testNestedGroups() throws { return; diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 752921e19..aafe752bc 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -484,4 +484,42 @@ extension RegexTests { expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) expectProgram(for: #"(a{1,})*"#, doesNotContain: 
[.moveCurrentPosition, .condBranchSamePosition]) } + + func testCanOnlyMatchAtStart() throws { + func expectCanOnlyMatchAtStart( + _ regexStr: String, + _ expectTrue: Bool, + file: StaticString = #file, + line: UInt = #line + ) throws { + let regex = try Regex(regexStr) + XCTAssertEqual( + regex.program.loweredProgram.canOnlyMatchAtStart, expectTrue, + file: file, line: line) + } + + try expectCanOnlyMatchAtStart("^foo", true) // anchor + try expectCanOnlyMatchAtStart("\\Afoo", true) // more specific anchor + try expectCanOnlyMatchAtStart("foo", false) // no anchor + + try expectCanOnlyMatchAtStart("(?i)^foo", true) // unrelated option + try expectCanOnlyMatchAtStart("(?m)^foo", false) // anchors match newlines + try expectCanOnlyMatchAtStart("(?i:^foo)", true) // unrelated option + try expectCanOnlyMatchAtStart("(?m:^foo)", false) // anchors match newlines + + try expectCanOnlyMatchAtStart("(^foo|bar)", false) // one side of alternation + try expectCanOnlyMatchAtStart("(foo|^bar)", false) // other side of alternation + try expectCanOnlyMatchAtStart("(^foo|^bar)", true) // both sides of alternation + + // Test quantifiers that include the anchor + try expectCanOnlyMatchAtStart("(^foo)?bar", false) + try expectCanOnlyMatchAtStart("(^foo)*bar", false) + try expectCanOnlyMatchAtStart("(^foo)+bar", true) + try expectCanOnlyMatchAtStart("(?:^foo)+bar", true) + + // Test quantifiers before the anchor + try expectCanOnlyMatchAtStart("(foo)?^bar", true) // The initial group must match "" + try expectCanOnlyMatchAtStart("(?:foo)?^bar", true) + try expectCanOnlyMatchAtStart("(foo)+^bar", false) // This can't actually match anywhere + } }