JSONFlattenerMaker: Speed up charsetFix. (apache#16212)

JSON parsing has this function "charsetFix" that fixes up strings so they can round-trip through UTF-8 encoding without loss of fidelity. It was originally introduced to fix a bug where strings could be sorted, encoded, then decoded, and the resulting decoded strings could end up no longer in sorted order (due to character swaps during the encode operation). The code has been in place for some time, and only applies to JSON. I am not sure if it needs to apply to other formats; it's certainly more difficult to get broken strings from other formats. It's easy in JSON because you can write a JSON string like "foo\uD900". At any rate, this patch does not revisit whether charsetFix should be applied to all formats. It merely optimizes it for the JSON case. The function works by using CharsetEncoder.canEncode, which is a relatively slow method (just as expensive as actually encoding). This patch adds a short-circuit to skip canEncode if all chars in a string are in the basic multilingual plane (i.e. if no chars are surrogates).
monetate · Apr 26, 2024 · 64a6fc8 · 64a6fc8
1 parent 9a2d7c2
commit 64a6fc8
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 5 deletions.
diff --git a/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java b/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
@@ -203,19 +203,53 @@ public static Object convertJsonNode(JsonNode val, CharsetEncoder enc)
     return null;
   }
 
+  /**
+   * Fix up a string so it can round-trip through UTF-8 encoding without loss of fidelity. This comes up when a string
+   * has surrogates (e.g. \uD900) that appear in invalid positions, such as alone rather than as part of a
+   * surrogate pair.
+   *
+   * This operation is useful because when a string cannot round-trip properly, it can cause it to sort differently
+   * relative to other strings after an encode/decode round-trip. This causes incorrect ordering in cases where strings
+   * are sorted, then encoded, then decoded again.
+   *
+   * @param s   string, or null
+   * @param enc UTF-8 encoder
+   *
+   * @return the original string, or a fixed version that can round-trip properly
+   */
   @Nullable
-  private static String charsetFix(String s, CharsetEncoder enc)
+  static String charsetFix(@Nullable String s, CharsetEncoder enc)
   {
-    if (s != null && !enc.canEncode(s)) {
-      // Some whacky characters are in this string (e.g. \uD900). These are problematic because they are decodeable
-      // by new String(...) but will not encode into the same character. This dance here will replace these
-      // characters with something more sane.
+    if (s != null && !isBmp(s) && !enc.canEncode(s)) {
+      // Note: the check isBmp isn't necessary for correct behavior, but it improves performance in the common case
+      // where all characters are in BMP. It short-circuits "enc.canEncode", which is a slow operation. The short
+      // circuit is valid because if every char in a string is in BMP, it is definitely encodable as UTF-8.
+
+      // This dance here will replace the original string with one that can be round-tripped.
       return StringUtils.fromUtf8(StringUtils.toUtf8(s));
     } else {
       return s;
     }
   }
 
+  /**
+   * Returns whether every character in a string is in BMP (basic multilingual plane).
+   */
+  private static boolean isBmp(String s)
+  {
+    for (int i = 0; i < s.length(); i++) {
+      final char c = s.charAt(i);
+
+      // All 16-bit code units are either valid code points, or surrogates. So if this char is not a surrogate,
+      // it must represent a code point in BMP.
+      if (Character.isSurrogate(c)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   private static boolean isFlatList(JsonNode list)
   {
     for (JsonNode obj : list) {

diff --git a/...ssing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java b/...ssing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java
@@ -31,6 +31,8 @@
 import org.junit.Test;
 
 import java.math.BigInteger;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Map;
 
@@ -199,4 +201,17 @@ public void testDiscovery() throws JsonProcessingException
         ImmutableSet.copyOf(FLATTENER_MAKER_NESTED.discoverRootFields(node))
     );
   }
+
+  @Test
+  public void testCharsetFix()
+  {
+    final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder();
+    Assert.assertEquals("hello", JSONFlattenerMaker.charsetFix("hello", encoder));
+    Assert.assertEquals("Apache® Druid", JSONFlattenerMaker.charsetFix("Apache® Druid", encoder));
+    Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uD900", encoder));
+    Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uD83D", encoder));
+    Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uDCAF", encoder));
+    Assert.assertEquals("hello💯", JSONFlattenerMaker.charsetFix("hello\uD83D\uDCAF", encoder));
+    Assert.assertEquals("héllö", JSONFlattenerMaker.charsetFix("héllö", encoder));
+  }
 }