Skip to content

Commit

Permalink
Speed up the FirstInvalidUnicodeSequenceIndex method
Browse files Browse the repository at this point in the history
  • Loading branch information
rampaa committed Dec 28, 2024
1 parent 910e092 commit 5ae3cf5
Showing 1 changed file with 28 additions and 16 deletions.
44 changes: 28 additions & 16 deletions JL.Core/Utilities/TextUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,40 @@ namespace JL.Core.Utilities;

public static class TextUtils
{
private const char HighSurrogateStart = '\uD800';
private const char Noncharacter = '\uFFFE';

// See https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs
// Adapted from that file's private static bool HasInvalidUnicodeSequence(ReadOnlySpan<char> s); this version returns the index of the invalid char it finds instead of a bool
private static int FirstInvalidUnicodeSequenceIndex(ReadOnlySpan<char> text)
{
for (int i = 0; i < text.Length; i++)
for (int i = text.IndexOfAnyInRange(HighSurrogateStart, Noncharacter); (uint)i < (uint)text.Length; i++)
{
char c = text[i];

if (c >= '\uD800')
if (c < HighSurrogateStart)
{
continue;
}

if (c is Noncharacter)
{
return i;
}

if (char.IsLowSurrogate(c))
{
return i;
}

if (char.IsHighSurrogate(c))
{
if (c is '\uFFFD' or '\uFFFE' or '\uFFFF' || char.IsLowSurrogate(c))
if ((uint)(i + 1) >= (uint)text.Length || !char.IsLowSurrogate(text[i + 1]))
{
return i;
}

if (char.IsHighSurrogate(c))
{
if (i + 1 >= text.Length || !char.IsLowSurrogate(text[i + 1]))
{
return i;
}

++i;
}
++i;
}
}

Expand All @@ -38,20 +50,20 @@ private static string RemoveInvalidUnicodeSequences(string text, int index)
{
StringBuilder sb = new(text[..index], text.Length - 1);

for (int i = index + 1; i < text.Length; i++)
for (int i = index + 1; (uint)i < (uint)text.Length; i++)
{
char c = text[i];

if (c < '\uD800')
if (c < HighSurrogateStart)
{
_ = sb.Append(c);
}

else if (c is not '\uFFFD' and not '\uFFFE' and not '\uFFFF' && !char.IsLowSurrogate(c))
else if (c is not Noncharacter && !char.IsLowSurrogate(c))
{
if (char.IsHighSurrogate(c))
{
if (i + 1 < text.Length && char.IsLowSurrogate(text[i + 1]))
if ((uint)(i + 1) < (uint)text.Length && char.IsLowSurrogate(text[i + 1]))
{
_ = sb.Append(c).Append(text[i + 1]);
++i;
Expand Down

0 comments on commit 5ae3cf5

Please sign in to comment.