Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work around a UnicodeSet bug #908

Merged
merged 7 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.unicode.text.UCD;

import com.google.common.base.Strings;
import com.ibm.icu.dev.tool.UOption;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.lang.UCharacter;
Expand All @@ -26,6 +27,7 @@
import java.util.Stack;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.unicode.cldr.draft.FileUtilities;
Expand Down Expand Up @@ -2007,17 +2009,63 @@ public BackwardParseException(String s, int errorOffset) {
}
}

private static Pattern nameEscape = Pattern.compile("\\\\N\\{[^}]*\\}");

public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp)
throws ParseException {
final int initialPosition = pp.getIndex();
UnicodeSet icuSet;
try {
final var result = new UnicodeSet(source, pp, symbolTable);
return result;
// Let ICU figure out where the UnicodeSet expression ends.
icuSet = new UnicodeSet(source, pp, symbolTable);
} catch (IllegalArgumentException e) {
// ICU produces unhelpful messages when parsing UnicodeSet deep into
// a large string in a string that contains line terminators, as the
// whole string is escaped and printed.
final String message = e.getMessage().split(" at \"", 2)[0];
throw new BackwardParseException(message, pp.getIndex());
}
String unicodeSetExpression = source.substring(initialPosition, pp.getIndex());
// ICU incorrectly treats \N{X} as a synonym for \p{Name=X}, returning a
// set rather than a character, so that it can be empty, and so that
// \N{X}-\N{Y} is a set difference (equal to \N{X}) rather than the range \N{X}-\N{Y}.
// This should likely be fixed in ICU, but in the meantime we need to work around it in
// the invariant before someone gets hurt.
var matcher = nameEscape.matcher(unicodeSetExpression);
if (!matcher.find()) {
return icuSet;
}
// Simplest way for the lambda function to report errors.
// It cannot throw a ParseException, and it cannot modify local variables.
// It _can_ modify what this local variable points to.
// Below, we will throw a ParseException for the first bad position.
final List<Integer> badEscapePositions = new ArrayList<>();
unicodeSetExpression =
matcher.replaceAll(
(MatchResult match) -> {
UnicodeSet character =
new UnicodeSet(
match.group(), new ParsePosition(0), symbolTable);
if (character.isEmpty()) {
badEscapePositions.add(match.start());
return "";
}
return Strings.padStart(
"\\\\x{" + Integer.toHexString(character.charAt(0)) + "}",
match.group().length() + 1,
' ');
});
for (int p : badEscapePositions) {
// Simplest way to throw an exception for only the first list element.
throw new ParseException("No character matching \\N escape", initialPosition + p);
}
var patchedParsePosition = new ParsePosition(0);
try {
return new UnicodeSet(unicodeSetExpression, patchedParsePosition, symbolTable);
} catch (IllegalArgumentException e) {
final String message = e.getMessage().split(" at \"", 2)[0];
throw new BackwardParseException(
message, patchedParsePosition.getIndex() + initialPosition);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ In \P{U-1:gc=Cn}, R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
# Red Flag: cased and case_ignorable should be disjoint,
# except for modifier letters and ◌ͅ.

\p{Cased} ∥ [\p{Case_Ignorable} - \p{gc=Lm} - \N{COMBINING GREEK YPOGEGRAMMENI} ]
\p{Cased} ∥ [\p{Case_Ignorable} - \p{gc=Lm} - [\N{COMBINING GREEK YPOGEGRAMMENI}] ]

##########################
# Property Stability Policies
Expand Down Expand Up @@ -837,8 +837,8 @@ Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\
# 7.0 added 275F..2760 1F676..1F678
\p{LB=QU} = [\p{GC=Pf} \p{GC=Pi} $QUInclusions]
\p{LB=SG} = \p{GC=Cs}
\p{LB=SP} = \N{SPACE}
\p{LB=SY} = \N{SOLIDUS}
\p{LB=SP} = [\N{SPACE}]
\p{LB=SY} = [\N{SOLIDUS}]

# The classes WJ and ZW are reserved for characters meant as explicit overrides
# to the line breaking algorithms. For historical reasons, WJ has two such
Expand All @@ -848,7 +848,7 @@ Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\
# behaviour requires the prohibition of line breaks on either side should have
# class GL.
\p{LB=WJ} = [\N{WORD JOINER} \N{ZERO WIDTH NO-BREAK SPACE}]
\p{LB=ZW} = \N{ZERO WIDTH SPACE}
\p{LB=ZW} = [\N{ZERO WIDTH SPACE}]

\p{LB=ZWJ} = [\u200D]
\p{LB=RI} = \p{RI=Y}
Expand Down Expand Up @@ -1249,7 +1249,7 @@ Let $cjkStrokes := \p{Name=/^CJK STROKE /}
Let $kangxiRadicals := \p{Name=/^KANGXI RADICAL /}
Let $cjkRadicals := \p{Name=/^CJK RADICAL /}
Let $strokesAndRadicals := [ $cjkStrokes $kangxiRadicals $cjkRadicals ]
Let $nonIdeographicRadicals := \N{CJK RADICAL REPEAT}
Let $nonIdeographicRadicals := [\N{CJK RADICAL REPEAT}]
# The following set may expand over time, if strokes are added.
# It can also shrink, if single-stroke ideographs are encoded.
Let $nonIdeographicStrokes := \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZWG|HZT|HZZP|HPWG|HZW|HZZZ|PG|Q|HXG|SZP)$/}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.io.IOException;
import java.text.ParseException;
import java.text.ParsePosition;
import org.junit.jupiter.api.Test;
import org.unicode.text.UCD.TestUnicodeInvariants.BackwardParseException;
import org.unicode.text.utility.Settings;

public class TestTestUnicodeInvariants {
Expand Down Expand Up @@ -49,4 +53,31 @@ void testSecurityInvariants() throws IOException {
TestUnicodeInvariants.testInvariants("SecurityInvariantTest.txt", "security", true);
assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(security) failed");
}

@Test
void testUnicodeSetParsing() throws ParseException {
assertEquals(
26,
TestUnicodeInvariants.parseUnicodeSet(
"TEST [\\N{LATIN SMALL LETTER A}-\\N{LATIN SMALL LETTER Z}]",
new ParsePosition(5))
.size());
ParseException thrown =
assertThrows(
ParseException.class,
() ->
TestUnicodeInvariants.parseUnicodeSet(
"TEST [\\N{MEOW}]", new ParsePosition(5)));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see a character encoding proposal brewing ;-)

assertEquals("No character matching \\N escape", thrown.getMessage());
assertEquals("TEST [".length(), thrown.getErrorOffset());
thrown =
assertThrows(
BackwardParseException.class,
() ->
TestUnicodeInvariants.parseUnicodeSet(
"TEST [[a-z]-\\N{LATIN SMALL LETTER Z}]",
new ParsePosition(5)));
assertEquals("Error: Set expected after operator", thrown.getMessage());
assertEquals("TEST [[a-z]-.N{LATIN SMALL LETTER Z}".length(), thrown.getErrorOffset());
}
}
Loading