Skip to content

Commit

Permalink
Adds SPI for a NSRE compatibility mode option (#698) (#702)
Browse files Browse the repository at this point in the history
NSRegularExpression matches at the Unicode scalar level, but also
matches `\r\n` sequences with a single `.` when single-line mode is
enabled. This adds a `_nsreCompatibility` property that enables both
of those behaviors, and implements support for the special case
handling of `.`.
  • Loading branch information
natecook1000 authored Dec 6, 2023
1 parent 355027f commit d56f16a
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 6 deletions.
3 changes: 3 additions & 0 deletions Sources/_RegexParser/Regex/AST/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ extension AST {

// Swift-only default possessive quantifier
case possessiveByDefault // t.b.d.

// NSRegularExpression compatibility special-case
case nsreCompatibleDot // no AST representation
}

public var kind: Kind
Expand Down
3 changes: 2 additions & 1 deletion Sources/_RegexParser/Regex/Parse/Sema.swift
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ extension RegexValidator {

case .caseInsensitive, .possessiveByDefault, .reluctantByDefault,
.singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended,
.asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps:
.asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps,
.nsreCompatibleDot:
break
}
}
Expand Down
14 changes: 10 additions & 4 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ fileprivate extension Compiler.ByteCodeGen {
emitAnyNonNewline()

case .dot:
emitDot()
try emitDot()

case let .char(c):
emitCharacter(c)
Expand Down Expand Up @@ -238,9 +238,15 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

mutating func emitDot() {
mutating func emitDot() throws {
if options.dotMatchesNewline {
emitAny()
if options.usesNSRECompatibleDot {
try emitAlternation([
.atom(.characterClass(.newlineSequence)),
.atom(.anyNonNewline)])
} else {
emitAny()
}
} else {
emitAnyNonNewline()
}
Expand Down Expand Up @@ -964,7 +970,7 @@ fileprivate extension Compiler.ByteCodeGen {
case let .customCharacterClass(ccc):
if ccc.containsDot {
if !ccc.isInverted {
emitDot()
try emitDot()
} else {
throw Unsupported("Inverted any")
}
Expand Down
7 changes: 7 additions & 0 deletions Sources/_StringProcessing/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ extension MatchingOptions {
? .graphemeCluster
: .unicodeScalar
}

/// Whether the last entry of the options `stack` contains the
/// `.nsreCompatibleDot` option.
var usesNSRECompatibleDot: Bool {
  // NOTE(review): the force-unwrap presumes `stack` is never empty — confirm
  // against the type's initializers.
  let currentOptions = stack.last!
  return currentOptions.contains(.nsreCompatibleDot)
}
}

// MARK: - Implementation
Expand All @@ -141,6 +145,7 @@ extension MatchingOptions {
// Not available via regex literal flags
case transparentBounds
case withoutAnchoringBounds
case nsreCompatibleDot

// Oniguruma options
case asciiOnlyDigit
Expand Down Expand Up @@ -197,6 +202,8 @@ extension MatchingOptions {
self = .byteSemantics
case .possessiveByDefault:
self = .possessiveByDefault
case .nsreCompatibleDot:
self = .nsreCompatibleDot

// Whitespace options are only relevant during parsing, not compilation.
case .extended, .extraExtended:
Expand Down
12 changes: 12 additions & 0 deletions Sources/_StringProcessing/Regex/Options.swift
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,18 @@ extension Regex {
return wrapInOption(.unicodeScalarSemantics, addingIf: true)
}
}

/// A copy of this regular expression that matches in an
/// NSRegularExpression compatibility mode.
///
/// The returned regex uses Unicode scalar semantics. In addition, when
/// dot-matches-newlines mode is enabled (a separate option that this
/// property does not turn on), a `.` is allowed to match a whole newline
/// sequence such as `\r\n`.
@_spi(Foundation)
public var _nsreCompatibility: Regex<RegexOutput> {
  // Apply the dot special-case first, then layer scalar semantics on top,
  // keeping the same wrapping order as a single chained expression would.
  let dotCompatible = wrapInOption(.nsreCompatibleDot, addingIf: true)
  return dotCompatible.wrapInOption(.unicodeScalarSemantics, addingIf: true)
}
}

/// A semantic level to use during regex matching.
Expand Down
38 changes: 37 additions & 1 deletion Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import XCTest
@testable import _RegexParser
@testable @_spi(RegexBenchmark) import _StringProcessing
@testable @_spi(RegexBenchmark) @_spi(Foundation) import _StringProcessing
import TestSupport

struct MatchError: Error {
Expand Down Expand Up @@ -2726,4 +2726,40 @@ extension RegexTests {
XCTAssertNotNil(str.wholeMatch(of: possessiveRegex))
}
}

/// Exercises the `_nsreCompatibility` SPI: scalar-level matching of custom
/// character classes, the special handling of `.` for newline sequences, and
/// the absence of that special handling for other "match anything" patterns.
func testNSRECompatibility() throws {
  // NSRE-compatibility includes scalar matching, so `[\r\n]` should match
  // either `\r` or `\n`.
  //
  // Note: inside this raw (`#"""`) literal only `\#r` is an escape sequence
  // (a carriage return). The trailing `\r` is a literal backslash followed by
  // `r`; the assertions below only depend on the first `;` + CR occurrence.
  let text = #"""
  y=sin(x)+sin(2x)+sin(3x);\#rText "This is a function of x.";\r
  """#
  let lineTerminationRegex = try Regex(#";[\r\n]"#)
    ._nsreCompatibility

  let afterLine = try XCTUnwrap(text.firstRange(of: "Text"))
  let match = try lineTerminationRegex.firstMatch(in: text)
  // `XCTAssertEqual` reports both indices on failure (and a `nil` match),
  // which a bare `XCTAssert(a == b)` does not.
  XCTAssertEqual(
    match?.range.upperBound,
    afterLine.lowerBound,
    "The match should end exactly where \"Text\" begins")

  // NSRE-compatibility treats "dot" as special, in that it can match a
  // newline sequence as well as a single Unicode scalar.
  let aDotBRegex = try Regex(#"a.b"#)
    ._nsreCompatibility
    .dotMatchesNewlines()
  for input in ["a\rb", "a\nb", "a\r\nb"] {
    XCTAssertNotNil(
      try aDotBRegex.wholeMatch(in: input),
      "Expected `a.b` to match \(input.debugDescription)")
  }

  // NSRE-compatibility doesn't give special treatment to newline sequences
  // when matching other "match everything" regex patterns, like `[[^z]z]`,
  // so this pattern doesn't match "a\r\nb".
  let aCCBRegex = try Regex(#"a[[^z]z]b"#)
    ._nsreCompatibility
  for input in ["a\rb", "a\nb", "a\r\nb"] {
    // The character class consumes exactly one scalar, so only the
    // three-scalar inputs can match in full.
    if input.unicodeScalars.count == 3 {
      XCTAssertNotNil(
        try aCCBRegex.wholeMatch(in: input),
        "Expected `a[[^z]z]b` to match \(input.debugDescription)")
    } else {
      XCTAssertNil(
        try aCCBRegex.wholeMatch(in: input),
        "Expected `a[[^z]z]b` not to match \(input.debugDescription)")
    }
  }
}
}

0 comments on commit d56f16a

Please sign in to comment.