diff --git a/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java b/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java index 5df98199080f..84490b19fce0 100644 --- a/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java +++ b/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java @@ -203,19 +203,53 @@ public static Object convertJsonNode(JsonNode val, CharsetEncoder enc) return null; } + /** + * Fix up a string so it can round-trip through UTF-8 encoding without loss of fidelity. This comes up when a string + * has surrogates (e.g. \uD900) that appear in invalid positions, such as alone rather than as part of a + * surrogate pair. + * + * This operation is useful because when a string cannot round-trip properly, it can cause it to sort differently + * relative to other strings after an encode/decode round-trip. This causes incorrect ordering in cases where strings + * are sorted, then encoded, then decoded again. + * + * @param s string, or null + * @param enc UTF-8 encoder + * + * @return the original string, or a fixed version that can round-trip properly + */ @Nullable - private static String charsetFix(String s, CharsetEncoder enc) + static String charsetFix(@Nullable String s, CharsetEncoder enc) { - if (s != null && !enc.canEncode(s)) { - // Some whacky characters are in this string (e.g. \uD900). These are problematic because they are decodeable - // by new String(...) but will not encode into the same character. This dance here will replace these - // characters with something more sane. + if (s != null && !isBmp(s) && !enc.canEncode(s)) { + // Note: the check isBmp isn't necessary for correct behavior, but it improves performance in the common case + // where all characters are in BMP. It short-circuits "enc.canEncode", which is a slow operation. The short + // circuit is valid because if every char in a string is in BMP, it is definitely encodable as UTF-8. + + // This dance here will replace the original string with one that can be round-tripped. return StringUtils.fromUtf8(StringUtils.toUtf8(s)); } else { return s; } } + /** + * Returns whether every character in a string is in BMP (basic multilingual plane). + */ + private static boolean isBmp(String s) + { + for (int i = 0; i < s.length(); i++) { + final char c = s.charAt(i); + + // All 16-bit code units are either valid code points, or surrogates. So if this char is not a surrogate, + // it must represent a code point in BMP. + if (Character.isSurrogate(c)) { + return false; + } + } + + return true; + } + private static boolean isFlatList(JsonNode list) { for (JsonNode obj : list) { diff --git a/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java b/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java index 5583081db10e..b471c24b40d1 100644 --- a/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java +++ b/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java @@ -31,6 +31,8 @@ import org.junit.Test; import java.math.BigInteger; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; @@ -199,4 +201,17 @@ public void testDiscovery() throws JsonProcessingException ImmutableSet.copyOf(FLATTENER_MAKER_NESTED.discoverRootFields(node)) ); } + + @Test + public void testCharsetFix() + { + final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder(); + Assert.assertEquals("hello", JSONFlattenerMaker.charsetFix("hello", encoder)); + Assert.assertEquals("Apache® Druid", JSONFlattenerMaker.charsetFix("Apache® Druid", encoder)); + Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uD900", encoder)); + Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uD83D", encoder)); + Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uDCAF", encoder)); + Assert.assertEquals("hello💯", JSONFlattenerMaker.charsetFix("hello\uD83D\uDCAF", encoder)); + Assert.assertEquals("héllö", JSONFlattenerMaker.charsetFix("héllö", encoder)); + } }