diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 92b738df7..d237633e7 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -1,5 +1,6 @@ package org.unicode.text.UCD; +import com.google.common.base.Strings; import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.lang.UCharacter; @@ -26,6 +27,7 @@ import java.util.Stack; import java.util.TreeMap; import java.util.function.Function; +import java.util.regex.MatchResult; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.unicode.cldr.draft.FileUtilities; @@ -2007,11 +2009,15 @@ public BackwardParseException(String s, int errorOffset) { } } + private static Pattern nameEscape = Pattern.compile("\\\\N\\{[^}]*\\}"); + public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp) throws ParseException { + final int initialPosition = pp.getIndex(); + UnicodeSet icuSet; try { - final var result = new UnicodeSet(source, pp, symbolTable); - return result; + // Let ICU figure out where the UnicodeSet expression ends. + icuSet = new UnicodeSet(source, pp, symbolTable); } catch (IllegalArgumentException e) { // ICU produces unhelpful messages when parsing UnicodeSet deep into // a large string in a string that contains line terminators, as the @@ -2019,5 +2025,47 @@ public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp) final String message = e.getMessage().split(" at \"", 2)[0]; throw new BackwardParseException(message, pp.getIndex()); } + String unicodeSetExpression = source.substring(initialPosition, pp.getIndex()); + // ICU incorrectly treats \N{X} as a synonym for \p{Name=X}, returning a + // set rather than a character, so that it can be empty, and so that + // \N{X}-\N{Y} is a set difference (equal to \N{X}) rather than the range \N{X}-\N{Y}. + // This should likely be fixed in ICU, but in the meantime we need to work around it in + // the invariant before someone gets hurt. + var matcher = nameEscape.matcher(unicodeSetExpression); + if (!matcher.find()) { + return icuSet; + } + // Simplest way for the lambda function to report errors. + // It cannot throw a ParseException, and it cannot modify local variables. + // It _can_ modify what this local variable points to. + // Below, we will throw a ParseException for the first bad position. + final List badEscapePositions = new ArrayList<>(); + unicodeSetExpression = + matcher.replaceAll( + (MatchResult match) -> { + UnicodeSet character = + new UnicodeSet( + match.group(), new ParsePosition(0), symbolTable); + if (character.isEmpty()) { + badEscapePositions.add(match.start()); + return ""; + } + return Strings.padStart( + "\\\\x{" + Integer.toHexString(character.charAt(0)) + "}", + match.group().length() + 1, + ' '); + }); + for (int p : badEscapePositions) { + // Simplest way to throw an exception for only the first list element. + throw new ParseException("No character matching \\N escape", initialPosition + p); + } + var patchedParsePosition = new ParsePosition(0); + try { + return new UnicodeSet(unicodeSetExpression, patchedParsePosition, symbolTable); + } catch (IllegalArgumentException e) { + final String message = e.getMessage().split(" at \"", 2)[0]; + throw new BackwardParseException( + message, patchedParsePosition.getIndex() + initialPosition); + } } } diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 0c4533f04..34c9de0d2 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -311,7 +311,7 @@ In \P{U-1:gc=Cn}, R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold # Red Flag: cased and case_ignorable should be disjoint, # except for modifier letters and ◌ͅ. -\p{Cased} ∥ [\p{Case_Ignorable} - \p{gc=Lm} - \N{COMBINING GREEK YPOGEGRAMMENI} ] +\p{Cased} ∥ [\p{Case_Ignorable} - \p{gc=Lm} - [\N{COMBINING GREEK YPOGEGRAMMENI}] ] ########################## # Property Stability Policies @@ -837,8 +837,8 @@ Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\ # 7.0 added 275F..2760 1F676..1F678 \p{LB=QU} = [\p{GC=Pf} \p{GC=Pi} $QUInclusions] \p{LB=SG} = \p{GC=Cs} -\p{LB=SP} = \N{SPACE} -\p{LB=SY} = \N{SOLIDUS} +\p{LB=SP} = [\N{SPACE}] +\p{LB=SY} = [\N{SOLIDUS}] # The classes WJ and ZW are reserved for characters meant as explicit overrides # to the line breaking algorithms. For historical reasons, WJ has two such @@ -848,7 +848,7 @@ Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\ # behaviour requires the prohibition of line breaks on either side should have # class GL. \p{LB=WJ} = [\N{WORD JOINER} \N{ZERO WIDTH NO-BREAK SPACE}] -\p{LB=ZW} = \N{ZERO WIDTH SPACE} +\p{LB=ZW} = [\N{ZERO WIDTH SPACE}] \p{LB=ZWJ} = [\u200D] \p{LB=RI} = \p{RI=Y} @@ -1249,7 +1249,7 @@ Let $cjkStrokes := \p{Name=/^CJK STROKE /} Let $kangxiRadicals := \p{Name=/^KANGXI RADICAL /} Let $cjkRadicals := \p{Name=/^CJK RADICAL /} Let $strokesAndRadicals := [ $cjkStrokes $kangxiRadicals $cjkRadicals ] -Let $nonIdeographicRadicals := \N{CJK RADICAL REPEAT} +Let $nonIdeographicRadicals := [\N{CJK RADICAL REPEAT}] # The following set may expand over time, if strokes are added. # It can also shrink, if single-stroke ideographs are encoded. Let $nonIdeographicStrokes := \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZWG|HZT|HZZP|HPWG|HZW|HZZZ|PG|Q|HXG|SZP)$/} diff --git a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java index 9fe411807..279d6dcaa 100644 --- a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java +++ b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java @@ -3,9 +3,13 @@ import static org.junit.jupiter.api.Assertions.assertAll; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import java.io.IOException; +import java.text.ParseException; +import java.text.ParsePosition; import org.junit.jupiter.api.Test; +import org.unicode.text.UCD.TestUnicodeInvariants.BackwardParseException; import org.unicode.text.utility.Settings; public class TestTestUnicodeInvariants { @@ -49,4 +53,31 @@ void testSecurityInvariants() throws IOException { TestUnicodeInvariants.testInvariants("SecurityInvariantTest.txt", "security", true); assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(security) failed"); } + + @Test + void testUnicodeSetParsing() throws ParseException { + assertEquals( + 26, + TestUnicodeInvariants.parseUnicodeSet( + "TEST [\\N{LATIN SMALL LETTER A}-\\N{LATIN SMALL LETTER Z}]", + new ParsePosition(5)) + .size()); + ParseException thrown = + assertThrows( + ParseException.class, + () -> + TestUnicodeInvariants.parseUnicodeSet( + "TEST [\\N{MEOW}]", new ParsePosition(5))); + assertEquals("No character matching \\N escape", thrown.getMessage()); + assertEquals("TEST [".length(), thrown.getErrorOffset()); + thrown = + assertThrows( + BackwardParseException.class, + () -> + TestUnicodeInvariants.parseUnicodeSet( + "TEST [[a-z]-\\N{LATIN SMALL LETTER Z}]", + new ParsePosition(5))); + assertEquals("Error: Set expected after operator", thrown.getMessage()); + assertEquals("TEST [[a-z]-.N{LATIN SMALL LETTER Z}".length(), thrown.getErrorOffset()); + } }