From 5ae3cf534e5868ae21382522561b1bfee13424cf Mon Sep 17 00:00:00 2001
From: rampaa <rampaa@waifuland.com>
Date: Sat, 28 Dec 2024 21:10:42 +0300
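Subject: [PATCH] Speed up the FirstInvalidUnicodeSequenceIndex method

Start the scan at the first char in the range U+D800..U+FFFE by using
IndexOfAnyInRange instead of walking the text from index 0, and mirror the
checks done by the .NET runtime's HasInvalidUnicodeSequence.

Note that this also narrows what is treated as invalid: previously U+FFFD,
U+FFFE and U+FFFF were all rejected; now only U+FFFE and unpaired surrogates
are. RemoveInvalidUnicodeSequences is updated to match.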
---
 JL.Core/Utilities/TextUtils.cs | 44 +++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 16 deletions(-)
diff --git a/JL.Core/Utilities/TextUtils.cs b/JL.Core/Utilities/TextUtils.cs
index b993e714..315e7d8f 100644
--- a/JL.Core/Utilities/TextUtils.cs
+++ b/JL.Core/Utilities/TextUtils.cs
@@ -6,28 +6,40 @@ namespace JL.Core.Utilities;
 
 public static class TextUtils
 {
+    private const char HighSurrogateStart = '\uD800';
+    private const char Noncharacter = '\uFFFE';
+
+    // See https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs
+    // Adapted from its private static bool HasInvalidUnicodeSequence(ReadOnlySpan<char> s); this version returns the index of the first invalid char.
     private static int FirstInvalidUnicodeSequenceIndex(ReadOnlySpan<char> text)
     {
-        for (int i = 0; i < text.Length; i++)
+        for (int i = text.IndexOfAnyInRange(HighSurrogateStart, Noncharacter); (uint)i < (uint)text.Length; i++)
         {
             char c = text[i];
 
-            if (c >= '\uD800')
+            if (c < HighSurrogateStart)
+            {
+                continue;
+            }
+
+            if (c is Noncharacter)
+            {
+                return i;
+            }
+
+            if (char.IsLowSurrogate(c))
+            {
+                return i;
+            }
+
+            if (char.IsHighSurrogate(c))
             {
-                if (c is '\uFFFD' or '\uFFFE' or '\uFFFF' || char.IsLowSurrogate(c))
+                if ((uint)(i + 1) >= (uint)text.Length || !char.IsLowSurrogate(text[i + 1]))
                 {
                     return i;
                 }
 
-                if (char.IsHighSurrogate(c))
-                {
-                    if (i + 1 >= text.Length || !char.IsLowSurrogate(text[i + 1]))
-                    {
-                        return i;
-                    }
-
-                    ++i;
-                }
+                ++i;
             }
         }
 
@@ -38,20 +50,20 @@ private static string RemoveInvalidUnicodeSequences(string text, int index)
     {
         StringBuilder sb = new(text[..index], text.Length - 1);
 
-        for (int i = index + 1; i < text.Length; i++)
+        for (int i = index + 1; (uint)i < (uint)text.Length; i++)
         {
             char c = text[i];
 
-            if (c < '\uD800')
+            if (c < HighSurrogateStart)
             {
                 _ = sb.Append(c);
             }
 
-            else if (c is not '\uFFFD' and not '\uFFFE' and not '\uFFFF' && !char.IsLowSurrogate(c))
+            else if (c is not Noncharacter && !char.IsLowSurrogate(c))
             {
                 if (char.IsHighSurrogate(c))
                 {
-                    if (i + 1 < text.Length && char.IsLowSurrogate(text[i + 1]))
+                    if ((uint)(i + 1) < (uint)text.Length && char.IsLowSurrogate(text[i + 1]))
                     {
                         _ = sb.Append(c).Append(text[i + 1]);
                         ++i;