Work around a UnicodeSet bug (#908)

Co-authored-by: Markus Scherer <[email protected]>
unicode-org · Aug 9, 2024 · 5845839 · 5845839
1 parent cccbe93
commit 5845839
Show file tree

Hide file tree

Showing 3 changed files with 86 additions and 7 deletions.
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -1,5 +1,6 @@
 package org.unicode.text.UCD;
 
+import com.google.common.base.Strings;
 import com.ibm.icu.dev.tool.UOption;
 import com.ibm.icu.dev.util.UnicodeMap;
 import com.ibm.icu.lang.UCharacter;
@@ -26,6 +27,7 @@
 import java.util.Stack;
 import java.util.TreeMap;
 import java.util.function.Function;
+import java.util.regex.MatchResult;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import org.unicode.cldr.draft.FileUtilities;
@@ -2007,17 +2009,63 @@ public BackwardParseException(String s, int errorOffset) {
         }
     }
 
+    private static Pattern nameEscape = Pattern.compile("\\\\N\\{[^}]*\\}");
+
     public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp)
             throws ParseException {
+        final int initialPosition = pp.getIndex();
+        UnicodeSet icuSet;
         try {
-            final var result = new UnicodeSet(source, pp, symbolTable);
-            return result;
+            // Let ICU figure out where the UnicodeSet expression ends.
+            icuSet = new UnicodeSet(source, pp, symbolTable);
         } catch (IllegalArgumentException e) {
             // ICU produces unhelpful messages when parsing UnicodeSet deep into
             // a large string in a string that contains line terminators, as the
             // whole string is escaped and printed.
             final String message = e.getMessage().split(" at \"", 2)[0];
             throw new BackwardParseException(message, pp.getIndex());
         }
+        String unicodeSetExpression = source.substring(initialPosition, pp.getIndex());
+        // ICU incorrectly treats \N{X} as a synonym for \p{Name=X}, returning a
+        // set rather than a character, so that it can be empty, and so that
+        // \N{X}-\N{Y} is a set difference (equal to \N{X}) rather than the range \N{X}-\N{Y}.
+        // This should likely be fixed in ICU, but in the meantime we need to work around it in
+        // the invariant before someone gets hurt.
+        var matcher = nameEscape.matcher(unicodeSetExpression);
+        if (!matcher.find()) {
+            return icuSet;
+        }
+        // Simplest way for the lambda function to report errors.
+        // It cannot throw a ParseException, and it cannot modify local variables.
+        // It _can_ modify what this local variable points to.
+        // Below, we will throw a ParseException for the first bad position.
+        final List<Integer> badEscapePositions = new ArrayList<>();
+        unicodeSetExpression =
+                matcher.replaceAll(
+                        (MatchResult match) -> {
+                            UnicodeSet character =
+                                    new UnicodeSet(
+                                            match.group(), new ParsePosition(0), symbolTable);
+                            if (character.isEmpty()) {
+                                badEscapePositions.add(match.start());
+                                return "";
+                            }
+                            return Strings.padStart(
+                                    "\\\\x{" + Integer.toHexString(character.charAt(0)) + "}",
+                                    match.group().length() + 1,
+                                    ' ');
+                        });
+        for (int p : badEscapePositions) {
+            // Simplest way to throw an exception for only the first list element.
+            throw new ParseException("No character matching \\N escape", initialPosition + p);
+        }
+        var patchedParsePosition = new ParsePosition(0);
+        try {
+            return new UnicodeSet(unicodeSetExpression, patchedParsePosition, symbolTable);
+        } catch (IllegalArgumentException e) {
+            final String message = e.getMessage().split(" at \"", 2)[0];
+            throw new BackwardParseException(
+                    message, patchedParsePosition.getIndex() + initialPosition);
+        }
     }
 }
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -311,7 +311,7 @@ In \P{U-1:gc=Cn}, R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
 # Red Flag: cased and case_ignorable should be disjoint,
 # except for modifier letters and ◌ͅ.
 
-\p{Cased} ∥ [\p{Case_Ignorable} - \p{gc=Lm} - \N{COMBINING GREEK YPOGEGRAMMENI} ]
+\p{Cased} ∥ [\p{Case_Ignorable} - \p{gc=Lm} - [\N{COMBINING GREEK YPOGEGRAMMENI}] ]
 
 ##########################
 # Property Stability Policies
@@ -837,8 +837,8 @@ Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\
 # 7.0 added 275F..2760 1F676..1F678
 \p{LB=QU} = [\p{GC=Pf} \p{GC=Pi} $QUInclusions]
 \p{LB=SG} = \p{GC=Cs}
-\p{LB=SP} = \N{SPACE}
-\p{LB=SY} = \N{SOLIDUS}
+\p{LB=SP} = [\N{SPACE}]
+\p{LB=SY} = [\N{SOLIDUS}]
 
 # The classes WJ and ZW are reserved for characters meant as explicit overrides
 # to the line breaking algorithms.  For historical reasons, WJ has two such
@@ -848,7 +848,7 @@ Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\
 # behaviour requires the prohibition of line breaks on either side should have
 # class GL.
 \p{LB=WJ} = [\N{WORD JOINER} \N{ZERO WIDTH NO-BREAK SPACE}]
-\p{LB=ZW} = \N{ZERO WIDTH SPACE}
+\p{LB=ZW} = [\N{ZERO WIDTH SPACE}]
 
 \p{LB=ZWJ} = [\u200D]
 \p{LB=RI} = \p{RI=Y}
@@ -1249,7 +1249,7 @@ Let $cjkStrokes := \p{Name=/^CJK STROKE /}
 Let $kangxiRadicals := \p{Name=/^KANGXI RADICAL /}
 Let $cjkRadicals := \p{Name=/^CJK RADICAL /}
 Let $strokesAndRadicals := [ $cjkStrokes $kangxiRadicals $cjkRadicals ]
-Let $nonIdeographicRadicals := \N{CJK RADICAL REPEAT}
+Let $nonIdeographicRadicals := [\N{CJK RADICAL REPEAT}]
 # The following set may expand over time, if strokes are added.
 # It can also shrink, if single-stroke ideographs are encoded.
 Let $nonIdeographicStrokes := \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZWG|HZT|HZZP|HPWG|HZW|HZZZ|PG|Q|HXG|SZP)$/}

diff --git a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java
@@ -3,9 +3,13 @@
 import static org.junit.jupiter.api.Assertions.assertAll;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 
 import java.io.IOException;
+import java.text.ParseException;
+import java.text.ParsePosition;
 import org.junit.jupiter.api.Test;
+import org.unicode.text.UCD.TestUnicodeInvariants.BackwardParseException;
 import org.unicode.text.utility.Settings;
 
 public class TestTestUnicodeInvariants {
@@ -49,4 +53,31 @@ void testSecurityInvariants() throws IOException {
                 TestUnicodeInvariants.testInvariants("SecurityInvariantTest.txt", "security", true);
         assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(security) failed");
     }
+
+    @Test
+    void testUnicodeSetParsing() throws ParseException {
+        assertEquals(
+                26,
+                TestUnicodeInvariants.parseUnicodeSet(
+                                "TEST [\\N{LATIN SMALL LETTER A}-\\N{LATIN SMALL LETTER Z}]",
+                                new ParsePosition(5))
+                        .size());
+        ParseException thrown =
+                assertThrows(
+                        ParseException.class,
+                        () ->
+                                TestUnicodeInvariants.parseUnicodeSet(
+                                        "TEST [\\N{MEOW}]", new ParsePosition(5)));
+        assertEquals("No character matching \\N escape", thrown.getMessage());
+        assertEquals("TEST [".length(), thrown.getErrorOffset());
+        thrown =
+                assertThrows(
+                        BackwardParseException.class,
+                        () ->
+                                TestUnicodeInvariants.parseUnicodeSet(
+                                        "TEST [[a-z]-\\N{LATIN SMALL LETTER Z}]",
+                                        new ParsePosition(5)));
+        assertEquals("Error: Set expected after operator", thrown.getMessage());
+        assertEquals("TEST [[a-z]-.N{LATIN SMALL LETTER Z}".length(), thrown.getErrorOffset());
+    }
 }