Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work around a UnicodeSet bug #908

Merged
merged 7 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.util.Stack;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.unicode.cldr.draft.FileUtilities;
Expand Down Expand Up @@ -2007,17 +2008,58 @@ public BackwardParseException(String s, int errorOffset) {
}
}

private static Pattern nameEscape = Pattern.compile("\\\\N\\{[^}]*\\}");

public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp)
throws ParseException {
final int initialPosition = pp.getIndex();
UnicodeSet icuSet;
try {
final var result = new UnicodeSet(source, pp, symbolTable);
return result;
// Let ICU figure out where the UnicodeSet expression ends.
icuSet = new UnicodeSet(source, pp, symbolTable);
} catch (IllegalArgumentException e) {
// ICU produces unhelpful messages when parsing UnicodeSet deep into
// a large string in a string that contains line terminators, as the
// whole string is escaped and printed.
final String message = e.getMessage().split(" at \"", 2)[0];
throw new BackwardParseException(message, pp.getIndex());
}
String unicodeSetExpression = source.substring(initialPosition, pp.getIndex());
// ICU incorrectly treats \N{X} as a synonym for \p{Name=X}, returning a
// set rather than a character, so that it can be empty, and so that
// \N{X}-\N{Y} is a set difference (equal to \N{X}) rather than the range \N{X}-\N{Y}.
// This should likely be fixed in ICU, but in the meantime we need to work around it in
// the invariant before someone gets hurt.
var matcher = nameEscape.matcher(unicodeSetExpression);
if (!matcher.find()) {
return icuSet;
}
List<Integer> badEscapePositions = new ArrayList<>();
eggrobin marked this conversation as resolved.
Show resolved Hide resolved
unicodeSetExpression =
matcher.replaceAll(
(MatchResult match) -> {
UnicodeSet character =
new UnicodeSet(
match.group(), new ParsePosition(0), symbolTable);
if (character.isEmpty()) {
badEscapePositions.add(match.start());
return "";
}
return String.format(
"%" + (match.group().length() + 1) + "s",
"\\\\x{" + Integer.toHexString(character.charAt(0)) + "}");
eggrobin marked this conversation as resolved.
Show resolved Hide resolved
});
for (int p : badEscapePositions) {
throw new ParseException("No character matching \\N escape", initialPosition + p);
}
markusicu marked this conversation as resolved.
Show resolved Hide resolved
eggrobin marked this conversation as resolved.
Show resolved Hide resolved
System.err.println(unicodeSetExpression);
eggrobin marked this conversation as resolved.
Show resolved Hide resolved
var patchedParsePosition = new ParsePosition(0);
try {
return new UnicodeSet(unicodeSetExpression, patchedParsePosition, symbolTable);
} catch (IllegalArgumentException e) {
final String message = e.getMessage().split(" at \"", 2)[0];
throw new BackwardParseException(
message, patchedParsePosition.getIndex() + initialPosition);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.io.IOException;
import java.text.ParseException;
import java.text.ParsePosition;
import org.junit.jupiter.api.Test;
import org.unicode.text.UCD.TestUnicodeInvariants.BackwardParseException;
import org.unicode.text.utility.Settings;

public class TestTestUnicodeInvariants {
Expand Down Expand Up @@ -49,4 +53,31 @@ void testSecurityInvariants() throws IOException {
TestUnicodeInvariants.testInvariants("SecurityInvariantTest.txt", "security", true);
assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(security) failed");
}

@Test
void testUnicodeSetParsing() throws ParseException {
assertEquals(
26,
TestUnicodeInvariants.parseUnicodeSet(
"TEST [\\N{LATIN SMALL LETTER A}-\\N{LATIN SMALL LETTER Z}]",
new ParsePosition(5))
.size());
ParseException thrown =
assertThrows(
ParseException.class,
() ->
TestUnicodeInvariants.parseUnicodeSet(
"TEST [\\N{MEOW}]", new ParsePosition(5)));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see a character encoding proposal brewing ;-)

assertEquals("No character matching \\N escape", thrown.getMessage());
assertEquals("TEST [".length(), thrown.getErrorOffset());
thrown =
assertThrows(
BackwardParseException.class,
() ->
TestUnicodeInvariants.parseUnicodeSet(
"TEST [[a-z]-\\N{LATIN SMALL LETTER Z}]",
new ParsePosition(5)));
assertEquals("Error: Set expected after operator", thrown.getMessage());
assertEquals("TEST [[a-z]-.N{LATIN SMALL LETTER Z}".length(), thrown.getErrorOffset());
}
}
Loading