Adds SPI for an NSRE compatibility mode option (#698) (#702)

NSRegularExpression matches at the Unicode scalar level, but also matches `\r\n` sequences with a single `.` when single-line mode is enabled. This adds a `_nsreCompatibility` property that enables both of those behaviors, and implements support for the special case handling of `.`.
swiftlang · Dec 6, 2023 · d56f16a · d56f16a
1 parent 355027f
commit d56f16a
Show file tree

Hide file tree

Showing 6 changed files with 71 additions and 6 deletions.
diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift
@@ -44,6 +44,9 @@ extension AST {
 
       // Swift-only default possessive quantifier
       case possessiveByDefault      // t.b.d.
+
+      // NSRegularExpression compatibility special-case
+      case nsreCompatibleDot        // no AST representation
     }
 
     public var kind: Kind

diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift
@@ -142,7 +142,8 @@ extension RegexValidator {
 
     case .caseInsensitive, .possessiveByDefault, .reluctantByDefault,
         .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended,
-        .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps:
+        .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps,
+        .nsreCompatibleDot:
       break
     }
   }

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -67,7 +67,7 @@ fileprivate extension Compiler.ByteCodeGen {
       emitAnyNonNewline()
 
     case .dot:
-      emitDot()
+      try emitDot()
 
     case let .char(c):
       emitCharacter(c)
@@ -238,9 +238,15 @@ fileprivate extension Compiler.ByteCodeGen {
     }
   }
 
-  mutating func emitDot() {
+  mutating func emitDot() throws {
     if options.dotMatchesNewline {
-      emitAny()
+      if options.usesNSRECompatibleDot {
+        try emitAlternation([
+          .atom(.characterClass(.newlineSequence)),
+          .atom(.anyNonNewline)])
+      } else {
+        emitAny()
+      }
     } else {
       emitAnyNonNewline()
     }
@@ -964,7 +970,7 @@ fileprivate extension Compiler.ByteCodeGen {
     case let .customCharacterClass(ccc):
       if ccc.containsDot {
         if !ccc.isInverted {
-          emitDot()
+          try emitDot()
         } else {
           throw Unsupported("Inverted any")
         }

diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift
@@ -120,6 +120,10 @@ extension MatchingOptions {
       ? .graphemeCluster
       : .unicodeScalar
   }
+
+  var usesNSRECompatibleDot: Bool {
+    stack.last!.contains(.nsreCompatibleDot)
+  }
 }
 
 // MARK: - Implementation
@@ -141,6 +145,7 @@ extension MatchingOptions {
     // Not available via regex literal flags
     case transparentBounds
     case withoutAnchoringBounds
+    case nsreCompatibleDot
 
     // Oniguruma options
     case asciiOnlyDigit
@@ -197,6 +202,8 @@ extension MatchingOptions {
         self = .byteSemantics
       case .possessiveByDefault:
         self = .possessiveByDefault
+      case .nsreCompatibleDot:
+        self = .nsreCompatibleDot
 
       // Whitespace options are only relevant during parsing, not compilation.
       case .extended, .extraExtended:

diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift
@@ -159,6 +159,18 @@ extension Regex {
       return wrapInOption(.unicodeScalarSemantics, addingIf: true)
     }
   }
+
+  /// Returns a regular expression that uses an NSRegularExpression
+  /// compatibility mode.
+  ///
+  /// This mode enables Unicode scalar semantics. In addition, when the
+  /// separately configured dot-matches-newlines option is on, `.` can match a
+  /// whole newline sequence (such as `\r\n`) as well as a single scalar.
+  @_spi(Foundation)
+  public var _nsreCompatibility: Regex<RegexOutput> {
+    wrapInOption(.nsreCompatibleDot, addingIf: true)
+      .wrapInOption(.unicodeScalarSemantics, addingIf: true)
+  }
 }
 
 /// A semantic level to use during regex matching.

diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
@@ -11,7 +11,7 @@
 
 import XCTest
 @testable import _RegexParser
-@testable @_spi(RegexBenchmark) import _StringProcessing
+@testable @_spi(RegexBenchmark) @_spi(Foundation) import _StringProcessing
 import TestSupport
 
 struct MatchError: Error {
@@ -2726,4 +2726,40 @@ extension RegexTests {
       XCTAssertNotNil(str.wholeMatch(of: possessiveRegex))
     }
   }
+
+  func testNSRECompatibility() throws {
+    // NSRE-compatibility includes scalar matching, so `[\r\n]` should match
+    // either `\r` or `\n`.
+    let text = #"""
+      y=sin(x)+sin(2x)+sin(3x);\#rText "This is a function of x.";\r
+      """#
+    let lineTerminationRegex = try Regex(#";[\r\n]"#)
+      ._nsreCompatibility
+
+    let afterLine = try XCTUnwrap(text.firstRange(of: "Text"))
+    let match = try lineTerminationRegex.firstMatch(in: text)
+    XCTAssertEqual(match?.range.upperBound, afterLine.lowerBound)
+
+    // NSRE-compatibility treats "dot" as special, in that it can match a
+    // newline sequence as well as a single Unicode scalar.
+    let aDotBRegex = try Regex(#"a.b"#)
+      ._nsreCompatibility
+      .dotMatchesNewlines()
+    for input in ["a\rb", "a\nb", "a\r\nb"] {
+      XCTAssertNotNil(try aDotBRegex.wholeMatch(in: input))
+    }
+
+    // NSRE-compatibility doesn't give special treatment to newline sequences
+    // when matching other "match everything" regex patterns, like `[[^z]z]`,
+    // so this pattern doesn't match "a\r\nb".
+    let aCCBRegex = try Regex(#"a[[^z]z]b"#)
+      ._nsreCompatibility
+    for input in ["a\rb", "a\nb", "a\r\nb"] {
+      if input.unicodeScalars.count == 3 {
+        XCTAssertNotNil(try aCCBRegex.wholeMatch(in: input))
+      } else {
+        XCTAssertNil(try aCCBRegex.wholeMatch(in: input))
+      }
+    }
+  }
 }