From 5ae3cf534e5868ae21382522561b1bfee13424cf Mon Sep 17 00:00:00 2001
From: rampaa
Date: Sat, 28 Dec 2024 21:10:42 +0300
Subject: [PATCH] Speed up the FirstInvalidUnicodeSequenceIndex method

---
 JL.Core/Utilities/TextUtils.cs | 44 +++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/JL.Core/Utilities/TextUtils.cs b/JL.Core/Utilities/TextUtils.cs
index b993e714..315e7d8f 100644
--- a/JL.Core/Utilities/TextUtils.cs
+++ b/JL.Core/Utilities/TextUtils.cs
@@ -6,28 +6,40 @@ namespace JL.Core.Utilities;
 
 public static class TextUtils
 {
+    private const char HighSurrogateStart = '\uD800';
+    private const char Noncharacter = '\uFFFE';
+
+    // See https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs
+    // Modified from private static bool HasInvalidUnicodeSequence(ReadOnlySpan<char> s)
     private static int FirstInvalidUnicodeSequenceIndex(ReadOnlySpan<char> text)
     {
-        for (int i = 0; i < text.Length; i++)
+        for (int i = text.IndexOfAnyInRange(HighSurrogateStart, Noncharacter); (uint)i < (uint)text.Length; i++)
         {
             char c = text[i];
 
-            if (c >= '\uD800')
+            if (c < HighSurrogateStart)
+            {
+                continue;
+            }
+
+            if (c is Noncharacter)
+            {
+                return i;
+            }
+
+            if (char.IsLowSurrogate(c))
+            {
+                return i;
+            }
+
+            if (char.IsHighSurrogate(c))
             {
-                if (c is '\uFFFD' or '\uFFFE' or '\uFFFF' || char.IsLowSurrogate(c))
+                if ((uint)(i + 1) >= (uint)text.Length || !char.IsLowSurrogate(text[i + 1]))
                 {
                     return i;
                 }
 
-                if (char.IsHighSurrogate(c))
-                {
-                    if (i + 1 >= text.Length || !char.IsLowSurrogate(text[i + 1]))
-                    {
-                        return i;
-                    }
-
-                    ++i;
-                }
+                ++i;
             }
         }
 
@@ -38,20 +50,20 @@ private static string RemoveInvalidUnicodeSequences(string text, int index)
     {
         StringBuilder sb = new(text[..index], text.Length - 1);
 
-        for (int i = index + 1; i < text.Length; i++)
+        for (int i = index + 1; (uint)i < (uint)text.Length; i++)
         {
             char c = text[i];
 
-            if (c < '\uD800')
+            if (c < HighSurrogateStart)
             {
                 _ = sb.Append(c);
             }
 
-            else if (c is not '\uFFFD' and not '\uFFFE' and not '\uFFFF' && !char.IsLowSurrogate(c))
+            else if (c is not Noncharacter && !char.IsLowSurrogate(c))
             {
                 if (char.IsHighSurrogate(c))
                 {
-                    if (i + 1 < text.Length && char.IsLowSurrogate(text[i + 1]))
+                    if ((uint)(i + 1) < (uint)text.Length && char.IsLowSurrogate(text[i + 1]))
                     {
                         _ = sb.Append(c).Append(text[i + 1]);
                         ++i;