diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 320b7d120..d809a1659 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -121,7 +121,6 @@ enum Expected { }; static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[=\u2282\u2283\u2286\u2287∥≉]"); - static final ParsePosition pp = new ParsePosition(0); private static PrintWriter out; @@ -179,102 +178,138 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang } else { out3.write('\uFEFF'); // BOM } + final var noComments = new StringBuilder(); + final List lines = new ArrayList<>(); + final List lineBeginnings = new ArrayList(); try (final BufferedReader in = getInputReader(inputFile)) { - errorLister = - new BagFormatter() - .setMergeRanges(doRange) - .setLabelSource(null) - .setUnicodePropertyFactory(LATEST_PROPS) - // .setTableHtml("") - .setShowLiteral(toHTML) - .setFixName(toHTML); - errorLister.setShowTotal(false); - if (doHtml) { - errorLister.setTabber(htmlTabber); - } + in.lines() + .forEach( + line -> { + if (line.startsWith("\uFEFF")) { + line = line.substring(1); + } + lines.add(line); + lineBeginnings.add(noComments.length()); + final int pos = line.indexOf('#'); + if (pos >= 0) { + line = line.substring(0, pos); + } + noComments.append(line.trim() + '\n'); + }); + } + errorLister = + new BagFormatter() + .setMergeRanges(doRange) + .setLabelSource(null) + .setUnicodePropertyFactory(LATEST_PROPS) + // .setTableHtml("
") + .setShowLiteral(toHTML) + .setFixName(toHTML); + errorLister.setShowTotal(false); + if (doHtml) { + errorLister.setTabber(htmlTabber); + } + + showLister = + new BagFormatter() + .setMergeRanges(doRange) + // .setLabelSource(null) + .setUnicodePropertyFactory(LATEST_PROPS) + // .setTableHtml("
") + .setShowLiteral(toHTML); + showLister.setShowTotal(false); + if (showScript) { + showLister.setValueSource(LATEST_PROPS.getProperty("script")); + } + if (doHtml) { + showLister.setTabber(htmlTabber); + } - showLister = - new BagFormatter() - .setMergeRanges(doRange) - // .setLabelSource(null) - .setUnicodePropertyFactory(LATEST_PROPS) - // .setTableHtml("
") - .setShowLiteral(toHTML); - showLister.setShowTotal(false); - if (showScript) { - showLister.setValueSource(LATEST_PROPS.getProperty("script")); + final String source = noComments.toString(); + final Function getLineNumber = + position -> { + for (int i = 0; i < lineBeginnings.size(); ++i) { + if (lineBeginnings.get(i) > position.getIndex()) { + return i; // 1-based line number. + } + } + return lineBeginnings.size(); + }; + int lastPrintedLine = 0; + final ParsePosition pp = new ParsePosition(0); + boolean followingParseError = false; + for (; ; ) { + final int statementStart = pp.getIndex(); + final int statementLineNumber = getLineNumber.apply(pp); + final var nextToken = Lookahead.oneToken(pp, source); + while (lastPrintedLine < statementLineNumber) { + println(lines.get(lastPrintedLine++)); } - if (doHtml) { - showLister.setTabber(htmlTabber); + if (nextToken == null) { + break; } - - // symbolTable = new ChainedSymbolTable(); - // new ChainedSymbolTable(new SymbolTable[] { - // - // ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"), - // - // ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")}); - for (int lineNumber = 1; ; ++lineNumber) { - String line = in.readLine(); - if (line == null) { - break; + try { + if (nextToken.accept("Let")) { + letLine(pp, source); + } else if (nextToken.accept("In")) { + inLine(pp, source, inputFile, getLineNumber); + } else if (nextToken.accept("Propertywise")) { + propertywiseLine(pp, source, inputFile, getLineNumber); + } else if (nextToken.accept("Map")) { + testMapLine(source, pp, getLineNumber); + } else if (nextToken.accept("ShowMap")) { + showMapLine(source, pp); + } else if (nextToken.accept("Show")) { + showLine(source, pp); + } else if (nextToken.accept("OnPairsOf")) { + equivalencesLine(source, pp, inputFile, getLineNumber); + } else { + pp.setIndex(statementStart); + testLine(source, pp, inputFile, getLineNumber); } - try { - if (line.startsWith("\uFEFF")) { - line = line.substring(1); + followingParseError = false; + } catch (final Exception e) { + if (!followingParseError) { + final int lineNumber = getLineNumber.apply(pp); + while (lineNumber > lastPrintedLine) { + println(lines.get(lastPrintedLine++)); } - println(line); - line = line.trim(); - final int pos = line.indexOf('#'); - if (pos >= 0) { - line = line.substring(0, pos).trim(); - } - if (line.length() == 0) { - continue; - } - if (line.equalsIgnoreCase("Stop")) { - break; - } else if (line.startsWith("Let")) { - letLine(pp, line); - } else if (line.startsWith("In")) { - inLine(pp, line, inputFile, lineNumber); - } else if (line.startsWith("Propertywise")) { - propertywiseLine(pp, line, inputFile, lineNumber); - } else if (line.startsWith("ShowScript")) { - showScript = true; - } else if (line.startsWith("HideScript")) { - showScript = false; - } else if (line.startsWith("Map")) { - testMapLine(line, pp, lineNumber); - } else if (line.startsWith("ShowMap")) { - showMapLine(line, pp); - } else if (line.startsWith("Show")) { - showLine(line, pp); - } else if (line.startsWith("OnPairsOf")) { - equivalencesLine(line, pp, inputFile, lineNumber); - } else { - testLine(line, pp, inputFile, lineNumber); - } - } catch (final Exception e) { parseErrorCount = - parseError(parseErrorCount, line, e, inputFile, lineNumber); + parseError( + parseErrorCount, + source, + e, + statementStart, + inputFile, + getLineNumber.apply(pp)); + } + // Give up on the whole line, it is unlikely to contain anything we can + // parse. + // Try parsing the next line, but since that may be the rest of what we + // failed to parse, + // do not report errors until we successfully parse *something*. + final int nextLine = source.indexOf("\n", pp.getIndex()); + if (nextLine >= 0) { + pp.setIndex(source.indexOf("\n", pp.getIndex())); + followingParseError = true; continue; + } else { + break; } } - println(); - println("**** SUMMARY ****"); - println(); - println("# ParseErrorCount=" + parseErrorCount); - System.out.println("ParseErrorCount=" + parseErrorCount); - println("# TestFailureCount=" + testFailureCount); - System.out.println("TestFailureCount=" + testFailureCount); - if (doHtml) { - out3.println(""); - } - out2.append(writer.getBuffer()); } + println(); + println("**** SUMMARY ****"); + println(); + println("# ParseErrorCount=" + parseErrorCount); + System.out.println("ParseErrorCount=" + parseErrorCount); + println("# TestFailureCount=" + testFailureCount); + System.out.println("TestFailureCount=" + testFailureCount); + if (doHtml) { + out3.println(""); + } + out2.append(writer.getBuffer()); } - out = null; } return parseErrorCount + testFailureCount; } @@ -332,18 +367,20 @@ protected String getFailure(int codepoint) { } } - private static void propertywiseLine(ParsePosition pp, String line, String file, int lineNumber) + private static void propertywiseLine( + ParsePosition pp, + String line, + String file, + Function getLineNumber) throws ParseException { - pp.setIndex("Propertywise".length()); - final UnicodeSet set = new UnicodeSet(line, pp, symbolTable); + final UnicodeSet set = parseUnicodeSet(line, pp); if (set.hasStrings()) { - throw new ParseException( + throw new BackwardParseException( "Set should contain only single code points for property comparison", pp.getIndex()); } expectToken("AreAlike", pp, line); - if (pp.getIndex() < line.length()) { - expectToken(",", pp, line); + if (Lookahead.oneToken(pp, line).accept(",")) { expectToken("Except", pp, line); expectToken(":", pp, line); } @@ -398,7 +435,9 @@ private static void propertywiseLine(ParsePosition pp, String line, String file, testFailureCount++; printErrorLine("Test Failure", Side.START, testFailureCount); reportTestFailure( - file, lineNumber, String.join("\n", errorMessageLines).replace('\t', ' ')); + file, + getLineNumber.apply(pp), + String.join("\n", errorMessageLines).replace('\t', ' ')); out.println("
"); for (String errorMessageLine : errorMessageLines) { out.println("
"); @@ -410,10 +449,13 @@ private static void propertywiseLine(ParsePosition pp, String line, String file, } } - private static void equivalencesLine(String line, ParsePosition pp, String file, int lineNumber) + private static void equivalencesLine( + String line, + ParsePosition pp, + String file, + Function getLineNumber) throws ParseException { - pp.setIndex("OnPairsOf".length()); - final UnicodeSet domain = new UnicodeSet(line, pp, symbolTable); + final UnicodeSet domain = parseUnicodeSet(line, pp); expectToken(",", pp, line); expectToken("EqualityOf", pp, line); final var leftProperty = CompoundProperty.of(LATEST_PROPS, line, pp); @@ -592,7 +634,9 @@ private static void equivalencesLine(String line, ParsePosition pp, String file, errorMessageLines.addAll(counterexamples); if (failure) { reportTestFailure( - file, lineNumber, String.join("\n", errorMessageLines).replace('\t', ' ')); + file, + getLineNumber.apply(pp), + String.join("\n", errorMessageLines).replace('\t', ' ')); } out.println(failure ? "" : "
"); for (String counterexample : counterexamples) { @@ -606,9 +650,12 @@ private static void equivalencesLine(String line, ParsePosition pp, String file, } } - private static void inLine(ParsePosition pp, String line, String file, int lineNumber) + private static void inLine( + ParsePosition pp, + String line, + String file, + Function getLineNumber) throws ParseException { - pp.setIndex(2); final PropertyPredicate propertyPredicate = getPropertyPredicate(pp, line); final UnicodeMap failures = propertyPredicate.getFailures(); final UnicodeSet failureSet = failures.keySet(); @@ -627,7 +674,8 @@ private static void inLine(ParsePosition pp, String line, String file, int lineN errorLister.setLineSeparator("\n"); errorLister.showSetNames(new PrintWriter(monoTable), failureSet); errorLister.setTabber(htmlTabber); - reportTestFailure(file, lineNumber, errorMessage + "\n" + monoTable.toString()); + reportTestFailure( + file, getLineNumber.apply(pp), errorMessage + "\n" + monoTable.toString()); if (doHtml) { out.println("
"); @@ -642,21 +690,74 @@ private static void inLine(ParsePosition pp, String line, String file, int lineN } } - private static void expectToken(String token, ParsePosition pp, String line) + // A one-token lookahead. + // Tokens are defined as runs of [^\p{Pattern_White_Space}\p{Pattern_Syntax}], + // or single code points in \p{Pattern_Syntax}. + private static class Lookahead { + // Advances pp through any pattern white space, then looks ahead one token. + public static Lookahead oneToken(ParsePosition pp, String text) { + scan(PATTERN_WHITE_SPACE, text, pp, true); + return oneTokenNoSpace(pp, text); + } + + // Returns null if pp is before pattern white space; otherwise, looks ahead one token. + public static Lookahead oneTokenNoSpace(ParsePosition pp, String text) { + ParsePosition next = new ParsePosition(pp.getIndex()); + if (next.getIndex() == text.length()) { + return null; + } + int start = next.getIndex(); + if (PATTERN_SYNTAX.contains(text.codePointAt(start))) { + final String result = Character.toString(text.codePointAt(start)); + next.setIndex(start + result.length()); + return new Lookahead(result, pp, next); + } else { + final String result = scan(PATTERN_SYNTAX_OR_WHITE_SPACE, text, next, false); + return result.isEmpty() ? null : new Lookahead(result, pp, next); + } + } + + private Lookahead(String token, ParsePosition pp, ParsePosition next) { + this.token = token; + this.pp = pp; + this.next = next; + } + + // Advances the ParsePosition passed at construction past the token, and returns the token. + public String consume() { + pp.setIndex(next.getIndex()); + return token; + } + + // If this token is expected, advances the ParsePosition passed at construction past the + // token past it and returns true. + // Otherwise, this function no effect and returns false. + public boolean accept(String expected) { + if (expected.equals(token)) { + consume(); + return true; + } else { + return false; + } + } + + private final String token; + private final ParsePosition pp; + private final ParsePosition next; + } + + private static void expectToken(String token, ParsePosition pp, String text) throws ParseException { - scan(PATTERN_WHITE_SPACE, line, pp, true); - if (!line.substring(pp.getIndex()).startsWith(token)) { - throw new ParseException("Expected " + token, pp.getIndex()); + if (!Lookahead.oneToken(pp, text).accept(token)) { + throw new ParseException("Expected '" + token + "'", pp.getIndex()); } - pp.setIndex(pp.getIndex() + token.length()); - scan(PATTERN_WHITE_SPACE, line, pp, true); } private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String line) throws ParseException { PropertyPredicate predicate; - final UnicodeSet valueSet = new UnicodeSet(line, pp, symbolTable); + final UnicodeSet valueSet = parseUnicodeSet(line, pp); expectToken(",", pp, line); final UnicodeProperty property1 = CompoundProperty.of(LATEST_PROPS, line, pp); final int cp = line.codePointAt(pp.getIndex()); @@ -674,7 +775,7 @@ private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String l final var containment = new PropertyValueContainment(); containment.shouldBeInSet = cp == '∈'; pp.setIndex(pp.getIndex() + 1); - containment.set = new UnicodeSet(line, pp, symbolTable); + containment.set = parseUnicodeSet(line, pp); predicate = containment; break; default: @@ -683,9 +784,6 @@ private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String l predicate.valueSet = valueSet; predicate.property1 = property1; scan(PATTERN_WHITE_SPACE, line, pp, true); - if (pp.getIndex() != line.length()) { - throw new ParseException(line, pp.getIndex()); - } return predicate; } @@ -705,29 +803,33 @@ enum Type { private Function, String> sequenceReduction; } - private static final UnicodeSet PROPCHARS = - new UnicodeSet("[a-zA-Z0-9.\\:\\-\\_\\u0020\\p{pattern white space}]"); + // TODO(egg): Consider bringing back Pattern_White_Space if requiring semicolons. + private static final UnicodeSet PROPCHARS = new UnicodeSet("[a-zA-Z0-9.\\:\\-\\_\\u0020}]"); private final List propOrFilters = new ArrayList(); static UnicodeProperty of( - UnicodeProperty.Factory propSource, String line, ParsePosition pp) { + UnicodeProperty.Factory propSource, String source, ParsePosition pp) + throws ParseException { final CompoundProperty result = new CompoundProperty(); while (true) { - scan(PATTERN_WHITE_SPACE, line, pp, true); - if (UnicodeSet.resemblesPattern(line, pp.getIndex())) { + scan(PATTERN_WHITE_SPACE, source, pp, true); + if (UnicodeSet.resemblesPattern(source, pp.getIndex())) { final FilterOrProp propOrFilter = new FilterOrProp(); - propOrFilter.filter = parseUnicodeSet(line, pp); + propOrFilter.filter = parseUnicodeSet(source, pp); propOrFilter.type = FilterOrProp.Type.filter; result.propOrFilters.add(propOrFilter); - } else if (line.charAt(pp.getIndex()) == '(') { + } else if (source.charAt(pp.getIndex()) == '(') { final FilterOrProp propOrFilter = new FilterOrProp(); final var matcher = - Pattern.compile("(\\( *([^ )]+)(?: +([^)]+))? *\\)).*") - .matcher(line.substring(pp.getIndex())); + Pattern.compile("(\\( *([^ )]+)(?: +([^)]+))? *\\)).*", Pattern.DOTALL) + .matcher(source.subSequence(pp.getIndex(), source.length())); if (!matcher.matches()) { throw new IllegalArgumentException( "Expected ( ), got " - + line.substring(pp.getIndex())); + + source.substring( + pp.getIndex(), + Math.min(pp.getIndex() + 50, source.length())) + + "…"); } propOrFilter.type = FilterOrProp.Type.sequenceTransformation; final String expression = matcher.group(1); @@ -800,7 +902,7 @@ static UnicodeProperty of( result.propOrFilters.add(propOrFilter); pp.setIndex(pp.getIndex() + expression.length()); } else { - final String propName = scan(PROPCHARS, line, pp, true); + final String propName = scan(PROPCHARS, source, pp, true); if (propName.length() > 0) { final FilterOrProp propOrFilter = new FilterOrProp(); final VersionedProperty xprop = @@ -821,12 +923,12 @@ static UnicodeProperty of( break; } } - scan(PATTERN_WHITE_SPACE, line, pp, true); + scan(PATTERN_WHITE_SPACE, source, pp, true); final int pos = pp.getIndex(); - if (pos == line.length()) { + if (pos == source.length()) { break; } - final int cp = line.charAt(pos); + final int cp = source.charAt(pos); if (cp != '*') { break; } @@ -964,37 +1066,32 @@ protected String _getVersion() { } } - private static void letLine(ParsePosition pp, String line) { - final int x = line.indexOf('='); - final String variable = line.substring(3, x).trim(); - if (!variable.startsWith("$")) { - throw new IllegalArgumentException("Variable must begin with '$': "); - } - final String value = line.substring(x + 1).trim(); - pp.setIndex(0); - final UnicodeSet valueSet = new UnicodeSet("[" + value + "]", pp, symbolTable); + private static void letLine(ParsePosition pp, String source) throws ParseException { + expectToken("$", pp, source); + final String variable = Lookahead.oneTokenNoSpace(pp, source).consume(); + expectToken("=", pp, source); + final int valueStart = pp.getIndex(); + final UnicodeSet valueSet = parseUnicodeSet(source, pp); valueSet.complement().complement(); - symbolTable.add(variable.substring(1), valueSet.toPattern(false)); + symbolTable.add(variable, valueSet.toPattern(false)); + final String value = source.substring(valueStart, pp.getIndex()); if (DEBUG) { System.out.println("Added variable: <" + variable + "><" + value + ">"); } - showSet(pp, value); + showSet(new ParsePosition(0), value); } - private static void showLine(String line, ParsePosition pp) { - String part = line.substring(4).trim(); - if (part.startsWith("Each")) { - part = part.substring(4).trim(); + private static void showLine(String source, ParsePosition pp) throws ParseException { + if (Lookahead.oneToken(pp, source).accept("Each")) { showLister.setMergeRanges(false); } - showSet(pp, part); + showSet(pp, source); showLister.setMergeRanges(doRange); } private static void showMapLine(String line, ParsePosition pp) { String part = line.substring(7).trim(); - pp.setIndex(0); pp.setErrorIndex(-1); if (part.startsWith("Each")) { part = part.substring(4).trim(); @@ -1009,33 +1106,28 @@ private static void showMapLine(String line, ParsePosition pp) { showLister.setMergeRanges(doRange); } - private static void testLine(String line, ParsePosition pp, String file, int lineNumber) + private static void testLine( + String source, + ParsePosition pp, + String file, + Function getLineNumber) throws ParseException { - if (line.startsWith("Test")) { - line = line.substring(4).trim(); - } - char relation = 0; String rightSide = null; String leftSide = null; UnicodeSet leftSet = null; UnicodeSet rightSet = null; - pp.setIndex(0); - leftSet = new UnicodeSet(line, pp, symbolTable); - leftSide = line.substring(0, pp.getIndex()); - scan(PATTERN_WHITE_SPACE, line, pp, true); - relation = line.charAt(pp.getIndex()); + final int leftStart = pp.getIndex(); + leftSet = parseUnicodeSet(source, pp); + leftSide = source.substring(leftStart, pp.getIndex()); + scan(PATTERN_WHITE_SPACE, source, pp, true); + relation = source.charAt(pp.getIndex()); checkRelation(pp, relation); pp.setIndex(pp.getIndex() + 1); // skip char - scan(PATTERN_WHITE_SPACE, line, pp, true); - final int start = pp.getIndex(); - rightSet = new UnicodeSet(line, pp, symbolTable); - rightSide = line.substring(start, pp.getIndex()); - scan(PATTERN_WHITE_SPACE, line, pp, true); - if (line.length() != pp.getIndex()) { - throw new ParseException("Extra characters at end", pp.getIndex()); - } + final int rightStart = pp.getIndex(); + rightSet = parseUnicodeSet(source, pp); + rightSide = source.substring(rightStart, pp.getIndex()); Expected right_left = Expected.irrelevant; Expected rightAndLeft = Expected.irrelevant; @@ -1078,7 +1170,7 @@ private static void testLine(String line, ParsePosition pp, String file, int lin "But Not In", leftSide, file, - lineNumber); + getLineNumber.apply(pp)); checkExpected( rightAndLeft, new UnicodeSet(rightSet).retainAll(leftSet), @@ -1087,7 +1179,7 @@ private static void testLine(String line, ParsePosition pp, String file, int lin "And In", leftSide, file, - lineNumber); + getLineNumber.apply(pp)); checkExpected( left_right, new UnicodeSet(leftSet).removeAll(rightSet), @@ -1096,7 +1188,7 @@ private static void testLine(String line, ParsePosition pp, String file, int lin "But Not In", rightSide, file, - lineNumber); + getLineNumber.apply(pp)); } public static void checkRelation(ParsePosition pp, char relation) throws ParseException { @@ -1176,7 +1268,8 @@ private static void checkExpected( getProperties(Settings.lastVersion), IndexUnicodeProperties.make(Settings.lastVersion))); - private static void testMapLine(String line, ParsePosition pp, int lineNumber) + private static void testMapLine( + String line, ParsePosition pp, Function getLineNumber) throws ParseException { char relation = 0; String rightSide = null; @@ -1184,7 +1277,6 @@ private static void testMapLine(String line, ParsePosition pp, int lineNumber) UnicodeMap leftSet = null; UnicodeMap rightSet = null; - pp.setIndex(3); leftSet = UMP.parse(line, pp); leftSide = line.substring(3, pp.getIndex()); scan(PATTERN_WHITE_SPACE, line, pp, true); @@ -1240,7 +1332,7 @@ private static void testMapLine(String line, ParsePosition pp, int lineNumber) rightSide, "But Not In", leftSide, - lineNumber); + getLineNumber.apply(pp)); checkExpected( rightAndLeft, UnicodeMapParser.retainAll(new UnicodeMap().putAll(rightSet), leftSet), @@ -1248,7 +1340,7 @@ private static void testMapLine(String line, ParsePosition pp, int lineNumber) rightSide, "And In", leftSide, - lineNumber); + getLineNumber.apply(pp)); checkExpected( left_right, UnicodeMapParser.removeAll(new UnicodeMap().putAll(leftSet), rightSet), @@ -1256,7 +1348,7 @@ private static void testMapLine(String line, ParsePosition pp, int lineNumber) leftSide, "But Not In", rightSide, - lineNumber); + getLineNumber.apply(pp)); } private static void checkExpected( @@ -1302,9 +1394,8 @@ private static void checkExpected( nf.setGroupingUsed(true); } - private static void showSet(ParsePosition pp, final String value) { - pp.setIndex(0); - UnicodeSet valueSet = new UnicodeSet(value, pp, symbolTable); + private static void showSet(ParsePosition pp, final String value) throws ParseException { + UnicodeSet valueSet = parseUnicodeSet(value, pp); final int totalSize = valueSet.size(); int abbreviated = 0; if (showRangeLimit >= 0) { @@ -1344,21 +1435,34 @@ private static void showSet(ParsePosition pp, final String value) { } private static int parseError( - int parseErrorCount, String line, Exception e, String file, int lineNumber) { + int parseErrorCount, + String source, + Exception e, + int statementStart, + String file, + int lineNumber) { parseErrorCount++; if (e instanceof ParseException) { final int index = ((ParseException) e).getErrorOffset(); - line = line.substring(0, index) + "☞" + line.substring(index); + final int eol = source.indexOf("\n", index); + source = + source.substring(statementStart, index) + + (e instanceof BackwardParseException ? "☜" : "☞") + + source.substring(index, eol >= 0 ? eol : source.length()); + } else { + final int sol = source.lastIndexOf("\n", statementStart); + final int eol = source.indexOf("\n", statementStart); + source = source.substring(sol >= 0 ? sol : 0, eol >= 0 ? eol : source.length()); } printErrorLine("Parse Failure", Side.START, parseErrorCount); - println("**** PARSE ERROR:\t" + line); + println("**** PARSE ERROR:\t" + source); out.println("
");
         final String message = e.getMessage();
         if (message != null) {
             println("##" + message);
         }
-        reportParseError(file, lineNumber, message);
+        reportParseError(file, lineNumber, message + "\n" + source);
         e.printStackTrace(out);
 
         out.println("
"); @@ -1394,12 +1498,17 @@ private static void printErrorLine(String title, Side side, int testFailureCount private static final String HTML_RULES_CONTROLS = HTML_RULES - + ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:] - [\\u0020\\u0009]] hex/unicode ; "; + + ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:] - [\\u0020\\u0009\\u000A]] hex/unicode ; " + + "\\u000A > '
'"; public static final Transliterator toHTMLControl = Transliterator.createFromRules("any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD); private static final UnicodeSet PATTERN_WHITE_SPACE = new UnicodeSet("\\p{pattern white space}").freeze(); + private static final UnicodeSet PATTERN_SYNTAX = new UnicodeSet("\\p{pattern syntax}").freeze(); + private static final UnicodeSet PATTERN_SYNTAX_OR_WHITE_SPACE = + new UnicodeSet("[\\p{pattern white space}\\p{pattern syntax}]").freeze(); + private static int testFailureCount; private static int parseErrorCount; private static BagFormatter errorLister; @@ -1518,7 +1627,7 @@ public int compare(String o1, String o2) { public void add(String variable, String value) { if (variables.containsKey(variable)) { - throw new IllegalArgumentException("Attempt to reset variable"); + throw new IllegalArgumentException("Attempt to reset variable " + variable); } variables.put(variable, value.toCharArray()); } @@ -1576,7 +1685,26 @@ public boolean applyPropertyAlias( } } - public static UnicodeSet parseUnicodeSet(String line, ParsePosition pp) { - return new UnicodeSet(line, pp, symbolTable); + // Some of our parse exceptions are thrown with a parse position before the problem. + // However, others are thrown with the parse position after the problem, so the message must be + // adjusted accordingly. + public static class BackwardParseException extends ParseException { + public BackwardParseException(String s, int errorOffset) { + super(s, errorOffset); + } + } + + public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp) + throws ParseException { + try { + final var result = new UnicodeSet(source, pp, symbolTable); + return result; + } catch (IllegalArgumentException e) { + // ICU produces unhelpful messages when parsing UnicodeSet deep into + // a large string in a string that contains line terminators, as the + // whole string is escaped and printed. + final String message = e.getMessage().split(" at \"", 2)[0]; + throw new BackwardParseException(message, pp.getIndex()); + } } } diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 65d0004eb..9f684144f 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -170,7 +170,15 @@ Let $fii = \p{toNFD=/$foo/} Let $codepoints = [\u0000-\U0010FFFF] Let $gcAllPunctuation = \p{gc=/_Punctuation/} -$gcAllPunctuation = [\p{gc=Close_Punctuation}\p{gc=Connector_Punctuation}}\p{gc=Dash_Punctuation}\p{gc=Final_Punctuation}\p{gc=Initial_Punctuation}\p{gc=Open_Punctuation}\p{gc=Other_Punctuation}] +$gcAllPunctuation = [ + \p{gc=Close_Punctuation} + \p{gc=Connector_Punctuation} + \p{gc=Dash_Punctuation} + \p{gc=Final_Punctuation} + \p{gc=Initial_Punctuation} + \p{gc=Open_Punctuation} + \p{gc=Other_Punctuation} +] Let $gcAllSymbols = \p{gc=/_Symbol/} $gcAllSymbols = [\p{gc=Math_Symbol}\p{gc=Currency_Symbol}\p{gc=Modifier_Symbol}\p{gc=Other_Symbol}] @@ -269,13 +277,34 @@ Let $BMExclusions = [ ≠ ∤ ∦ ≢ ≭ ⫝̸ ] In [\p{dt=canonical}-$BMExclusions], (delete-adjacent-duplicates) * Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM} # Additional BIDI invariant constants -Let $AL_blocks = [\u0600-\u07BF \u0860-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF \uFE70-\uFEFF \U00010D00-\U00010D3F \U00010EC0-\U00010EFF \U00010F30-\U00010F6F \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF] -Let $R_blocks = [\u0590-\u05FF \u07C0-\u085F \uFB1D-\uFB4F \U00010800-\U00010CFF \U00010D40-\U00010EBF \U00010F00-\U00010F2F \U00010F70-\U00010FFF \U0001E800-\U0001EC6F \U0001ECC0-\U0001ECFF \U0001ED50-\U0001EDFF \U0001EF00-\U0001EFFF] -# 6.1.0 updated blocks -# 10.0 updated blocks (Syriac Supplement is bc=AL) -# 11.0 updated blocks (Hanifi Rohingya, Sogdian, Indic Siyaq Numbers are bc=AL); Old Sogdian is bc=R -# 12.0 updated blocks (Ottoman Siyaq Numbers is bc=AL) -# 14.0 updated blocks (Arabic Extended-B is bc=AL) +Let $AL_blocks = [ + \u0600-\u07BF + \u0860-\u086F # Syriac Supplement, 10.0 + \u0870-\u089F # Arabic Extended-B, 14.0 + \u08A0-\u08FF + \uFB50-\uFDCF + \uFDF0-\uFDFF + \uFE70-\uFEFF + \U00010D00-\U00010D3F # Hanifi Rohingya, 11.0 + \U00010EC0-\U00010EFF + \U00010F30-\U00010F6F # Sogdian, 11.0 + \U0001EC70-\U0001ECBF # Indic Siyaq Numbers, 11.0 + \U0001ED00-\U0001ED4F # Ottoman Siyaq Numbers, 12.0 + \U0001EE00-\U0001EEFF +] +Let $R_blocks = [ + \u0590-\u05FF + \u07C0-\u085F + \uFB1D-\uFB4F + \U00010800-\U00010CFF + \U00010D40-\U00010EBF + \U00010F00-\U00010F2F + \U00010F70-\U00010FFF + \U0001E800-\U0001EC6F + \U0001ECC0-\U0001ECFF + \U0001ED50-\U0001EDFF + \U0001EF00-\U0001EFFF +] # Unassigned characters in these blocks have R or AL respectively \p{Bidi_Class=R} ⊇ [$R_blocks & \p{gc=Cn}] @@ -292,7 +321,14 @@ $AL_blocks ∥ [\p{Bidi_Class=L} \p{Bidi_Class=R}] Let $BN_Exceptions = [\u001C-\u001F\u17B4\u17B5] -[\p{Bidi_Class=BN}] = [\p{di}\p{nchar}\p{gc=Cc}-\p{gc=Mc}-\p{gc=Mn}-\p{gc=Me}-\p{Bidi_C}-\p{alpha}-\p{wspace} - $BN_Exceptions] +[\p{Bidi_Class=BN}] = [ + \p{di}\p{nchar}\p{gc=Cc} + - \p{gc=Mc} - \p{gc=Mn} - \p{gc=Me} + - \p{Bidi_C} + - \p{alpha} + - \p{wspace} + - $BN_Exceptions +] # Nonspacing and enclosing combining marks are bc=NSM, with a few exceptions (all of which are nonspacing) Let $gcMn_bcL = [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F] @@ -425,25 +461,37 @@ In \P{U-1:GC=Cn}, ccc=U-1:ccc # Canonical decompositions (minus exclusions) must be identical across releases (also required by strong normalization stability), # except where a character and at least one character in its decomposition are both new in the release. -Let $New_Decompositions = [[\p{Decomposition_Type=Canonical} - \p{Full_Composition_Exclusion}] - [\p{U-1:Decomposition_Type=Canonical} - \p{U-1:Full_Composition_Exclusion}]] +Let $New_Decompositions = [ + [ \p{Decomposition_Type=Canonical} - \p{Full_Composition_Exclusion}] + - [\p{U-1:Decomposition_Type=Canonical} - \p{U-1:Full_Composition_Exclusion}] +] $New_Decompositions ⊆ \p{U-1:GC=Cn} # Stripping previously-unassigned characters from the current NFD does # something, that is, the decomposition contains newly-assigned characters. In $New_Decompositions, toNFD * \P{U-1:GC=Cn} ≠ toNFD -Let $Unicode_13_Decompositions = [[\p{U13.0.0:Decomposition_Type=Canonical} - \p{U13.0.0:Full_Composition_Exclusion}] - [\p{U12.1.0:Decomposition_Type=Canonical} - \p{U12.1.0:Full_Composition_Exclusion}]] +Let $Unicode_13_Decompositions = [ + [\p{U13.0.0:Decomposition_Type=Canonical} - \p{U13.0.0:Full_Composition_Exclusion}] + - [\p{U12.1.0:Decomposition_Type=Canonical} - \p{U12.1.0:Full_Composition_Exclusion}] +] $Unicode_13_Decompositions ⊆ \p{U12.1.0:GC=Cn} In $Unicode_13_Decompositions, toNFD * \P{U12.1.0:GC=Cn} ≠ toNFD $Unicode_13_Decompositions = [\U00011938] $Unicode_13_Decompositions = [\p{Name=DIVES AKURU VOWEL SIGN O}] -Let $Unicode_7_Decompositions = [[\p{U7.0.0:Decomposition_Type=Canonical} - \p{U7.0.0:Full_Composition_Exclusion}] - [\p{U6.3.0:Decomposition_Type=Canonical} - \p{U6.3.0:Full_Composition_Exclusion}]] +Let $Unicode_7_Decompositions = [ + [\p{U7.0.0:Decomposition_Type=Canonical} - \p{U7.0.0:Full_Composition_Exclusion}] + - [\p{U6.3.0:Decomposition_Type=Canonical} - \p{U6.3.0:Full_Composition_Exclusion}] +] $Unicode_7_Decompositions ⊆ \p{U6.3.0:GC=Cn} In $Unicode_7_Decompositions, toNFD * \P{U6.3.0:GC=Cn} ≠ toNFD $Unicode_7_Decompositions = [\U0001134B-\U0001134C \U000114BB-\U000114BC \U000114BE \U000115BA-\U000115BB] $Unicode_7_Decompositions ⊆ [\p{Name=/^(GRANTHA|TIRHUTA|SIDDHAM) VOWEL SIGN /}] -Let $Unicode_6_1_Decompositions = [[\p{U6.1.0:Decomposition_Type=Canonical} - \p{U6.1.0:Full_Composition_Exclusion}] - [\p{U6.0.0:Decomposition_Type=Canonical} - \p{U6.0.0:Full_Composition_Exclusion}]] +Let $Unicode_6_1_Decompositions = [ + [\p{U6.1.0:Decomposition_Type=Canonical} - \p{U6.1.0:Full_Composition_Exclusion}] + - [\p{U6.0.0:Decomposition_Type=Canonical} - \p{U6.0.0:Full_Composition_Exclusion}] +] $Unicode_6_1_Decompositions ⊆ \p{U6.0.0:GC=Cn} In $Unicode_6_1_Decompositions, toNFD * \P{U6.0.0:GC=Cn} ≠ toNFD $Unicode_6_1_Decompositions = [\U0001112E-\U0001112F] @@ -469,7 +517,9 @@ In $expandingCanonicalDecompositions, Decomposition_Type * (drop 1) * Decomposit # Not a stability policy, but it happens to be the case that the second # character does not have a decomposition mapping at all: In $expandingCanonicalDecompositions, Decomposition_Type * (drop 1) * Decomposition_Mapping = (constant None) -In $expandingCanonicalDecompositions, Decomposition_Mapping * (drop 1) * Decomposition_Mapping = (drop 1) * Decomposition_Mapping +In $expandingCanonicalDecompositions, + Decomposition_Mapping * (drop 1) * Decomposition_Mapping + = (drop 1) * Decomposition_Mapping # Stability: Canonical mappings (Decomposition_Mapping property values) are # always limited so that no string when normalized to NFC expands to more than @@ -488,7 +538,8 @@ In \P{U-1:GC=Cn}, dm=U-1:dm # must have ccc=0, except for the Decomposition_Mapping of the following four # characters: U+0344, U+0F73, U+0F75, U+0F81. Let $canonicallyExpandingNonstarters = [\u0344 \u0F73 \u0F75 \u0F81] -In [$expandingCanonicalDecompositions - $canonicallyExpandingNonstarters], ccc * (take 1) * Decomposition_Mapping = (constant Not_Reordered) +In [$expandingCanonicalDecompositions - $canonicallyExpandingNonstarters], + ccc * (take 1) * Decomposition_Mapping = (constant Not_Reordered) # U6.0: Construction of Full_Composition_Exclusion # Primary Composites don't include singletons, ccc!=0, or sequences starting with ccc!=0 @@ -547,7 +598,7 @@ Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+(.[0-9]+)?/} # Musical symbol combining marks, other oddities -Let $AlphaExclusions = [\uAA7D \u0F3E\u0F3F\u1063\u1064\u1069-\u106D\u1087-\u108C\u108F\u109A\u109B\u1CE1\u1CF7\uAA7B\uABEC\U0001D165\U0001D166\U0001D16D-\U0001D172][[:gc=mc:]&[:ccc=9:][\u302E\u302F]] +Let $AlphaExclusions = [[\uAA7D \u0F3E\u0F3F\u1063\u1064\u1069-\u106D\u1087-\u108C\u108F\u109A\u109B\u1CE1\u1CF7\uAA7B\uABEC\U0001D165\U0001D166\U0001D16D-\U0001D172][[:gc=mc:]&[:ccc=9:][\u302E\u302F]]] # 6.1.0 Added HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK # 7.0 Added AA7D # 10.0 Added 1CF7 (similar to 1CE1) @@ -584,7 +635,13 @@ Show [\u20b9] Let $nonAlphabeticBindus = [] [\p{InSc=Bindu} - \p{Alphabetic}] = $nonAlphabeticBindus -Let $nonAlphabeticDependentVowels = [\N{ORIYA SIGN OVERLINE}\N{THAI CHARACTER MAITAIKHU}\N{LIMBU SIGN KEMPHRENG}\N{SHARADA VOWEL MODIFIER MARK}\N{SHARADA EXTRA SHORT VOWEL MARK}] +Let $nonAlphabeticDependentVowels = [ + \N{ORIYA SIGN OVERLINE} + \N{THAI CHARACTER MAITAIKHU} + \N{LIMBU SIGN KEMPHRENG} + \N{SHARADA VOWEL MODIFIER MARK} + \N{SHARADA EXTRA SHORT VOWEL MARK} +] [\p{InSC=Vowel_Dependent} - \p{Alphabetic}] = $nonAlphabeticDependentVowels # Several invariants from L2/24-009 item 2.2. @@ -596,7 +653,11 @@ Let $nonAlphabeticAvagrahas = [\N{TIBETAN MARK PALUTA}] # A punctuation mark. [\p{InSC=Avagraha} - $nonAlphabeticAvagrahas] ⊆ \p{Alphabetic} # Name-based checks. -Let $nonLowercaseSmallLetters = [ \p{name=/^LIMBU SMALL LETTER/} \N{TURNED GREEK SMALL LETTER IOTA} \p{name=/^(SQUARED|PARENTHESIZED|TAG) LATIN SMALL LETTER/} ] +Let $nonLowercaseSmallLetters = [ + \p{name=/^LIMBU SMALL LETTER/} + \N{TURNED GREEK SMALL LETTER IOTA} + \p{name=/^(SQUARED|PARENTHESIZED|TAG) LATIN SMALL LETTER/} +] Let $nonLowercaseSmallModifierLetters = [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] [ \p{name=/\bSMALL LETTER\b/}-\p{gc=Mn}-\p{gc=Lt} - $nonLowercaseSmallLetters ] ⊆ \p{Lowercase} [ [\p{gc=Lm} & \p{name=/SMALL/}] - $nonLowercaseSmallModifierLetters ] ⊆ \p{Lowercase} @@ -633,14 +694,39 @@ In \P{Other_Joining_Type=Deduce_From_General_Category}, Joining_Type = Other_Joi # LineBreak property ########################## -Let $IDInclusions = [[:block=/Ideographs/:] [[\U00020000-\U0003FFFF][\U0001F000-\U0001FFFF] - [[:block=Symbols for Legacy Computing:][:block=Supplemental Arrows C:]]] & [:gc=Cn:] - [:NChar:]] -# 9.0 Added range 1F000..1FFFF: all undesignated code points in this range are lb=ID -# 13.0 exclude those in 1FB00..1FBFF Symbols for Legacy Computing -# 16.0 exclude Supplemental Arrows C +Let $IDInclusions = [ + [:block=/Ideographs/:] + [ # Some ranges default to lb=ID even outside of any blocks: + [\U00020000-\U0003FFFF] # Planes 2 and 3, lb=ID since 5.2, 115-C27. + [\U0001F000-\U0001FFFF] # SMP range lb=ID by default since 9.0, 147-C25, + - [ # with exceptions: + [:block=Symbols for Legacy Computing:] # since 13.0, 162-A67; + [:block=Supplemental Arrows C:] # since 16.0, 177-C47. + ] + ] & [:gc=Cn:] - [:NChar:] +] \p{LB=ID} ⊃ $IDInclusions -\p{Line_Break=Unknown} = [\p{General_Category=Unassigned} \p{GeneralCategory=PrivateUse} - $IDInclusions - [\u20C0-\u20CF]] - -Let $BrahmicLineBreaking = [\p{sc=Balinese}\p{sc=Batak}\p{sc=Brahmi}\p{sc=Cham}\p{sc=DivesAkuru}\p{sc=Grantha}\p{sc=Javanese}\p{sc=Makasar}\p{sc=Kawi}\p{sc=Cham}\p{sc=Makasar}\p{sc=Tulu_Tigalari}\p{sc=Gurung_Khema}] +\p{Line_Break=Unknown} = [ + \p{General_Category=Unassigned} \p{GeneralCategory=PrivateUse} + - $IDInclusions + - [\u20C0-\u20CF] # Unassigned currency symbols are lb=PR since 6.3, 133-C26. +] + +Let $BrahmicLineBreaking = [ + \p{sc=Balinese} + \p{sc=Batak} + \p{sc=Brahmi} + \p{sc=Cham} + \p{sc=DivesAkuru} + \p{sc=Grantha} + \p{sc=Javanese} + \p{sc=Makasar} + \p{sc=Kawi} + \p{sc=Cham} + \p{sc=Makasar} + \p{sc=Tulu_Tigalari} + \p{sc=Gurung_Khema} +] Let $VFScripts = [\p{sc=Batak}] Let $OPInclusions = [\u00A1\u00BF\u2E18\U00013258-\U0001325A\U00013286\U00013288\U00013379\U0001342F\U00013437\U0001343C\U0001343E\U000145CE\U0001E95E-\U0001E95F] @@ -658,9 +744,15 @@ Let $OPInclusions = [\u00A1\u00BF\u2E18\U00013258-\U0001325A\U00013286\U00013288 \p{LB=VI} = [[\p{Indic_Syllabic_Category=Virama}\p{Indic_Syllabic_Category=Invisible_Stacker}] & $BrahmicLineBreaking] \p{LB=VF} = [\p{Indic_Syllabic_Category=Reordering_Killer} & $VFScripts] -# 15.1: Action item UTC-176-A81: change [[:PCM:]-\u070F] lb=AL->NU -\p{LB=CM} = [[\u3035] \p{GC=Mn} \p{GC=Me} \p{GC=Mc} \p{GC=Cc} \p{GC=Cf} -[\U00013437\U00013438\U0001343C-\U0001343F] -\p{LB=SA} -\p{LB=WJ} -\p{LB=ZW} -\p{LB=BA} -\p{LB=LF} -\p{LB=BK} -\p{LB=CR} -\p{LB=NL} -\p{LB=GL} -\p{LB=AL} -\p{LB=ZWJ} - \p{LB=VI} - \p{LB=VF} - \p{LB=NU}] -# Excluded Egyptian controls begin/end segment etc. 13437, 13438 & 1343C..1343F (gc=Cf, lb=OP/CL) +\p{LB=CM} = [ + [\u3035] \p{GC=Mn} \p{GC=Me} \p{GC=Mc} \p{GC=Cc} \p{GC=Cf} + - [\U00013437\U00013438\U0001343C-\U0001343F] # Egyptian controls begin/end segment etc. (gc=Cf, lb=OP/CL) + - \p{LB=SA} - \p{LB=WJ} - \p{LB=ZW} - \p{LB=BA} + - \p{LB=LF} - \p{LB=BK} - \p{LB=CR} - \p{LB=NL} + - \p{LB=GL} - \p{LB=AL} - \p{LB=ZWJ} + - \p{LB=VI} - \p{LB=VF} + - \p{LB=NU} # 176-A81 changed [[:PCM:]-\u070F] from lb=AL to lb=NU +] # 3.0.0: Numeric characters consist of decimal digits (all characters of General_Category Nd), # except those with East_Asian_Width F (Fullwidth) @@ -725,7 +817,17 @@ Let $QUInclusions = [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\u # covered by adding them to the exception set $SAScriptExceptions for the test. # SA are limited to certain scripts: -Let $SAScripts = [\p{script=ahom} \p{script=thai} \p{script=lao} \p{script=myanmar} \p{script=khmer} \p{script=Tai_Le} \p{script=New_Tai_Lue} \p{script=Tai_Tham} \p{script=Tai_Viet}] +Let $SAScripts = [ + \p{script=ahom} + \p{script=thai} + \p{script=lao} + \p{script=myanmar} + \p{script=khmer} + \p{script=Tai_Le} + \p{script=New_Tai_Lue} + \p{script=Tai_Tham} + \p{script=Tai_Viet} +] $SAScripts ⊇ \p{LineBreak=SA} # And in $SA scripts, they are all the alphabetic spacing characters, plus some odd Cf & Mn, plus the NEW TAI LUE THAM DIGIT ONE @@ -845,9 +947,12 @@ Let $PostBaseSpacingMarks_Missed = [] Let $TwoForgottenMusicalSymbols = \p{Name=/^MUSICAL SYMBOL COMBINING (SPRECHGESANG STEM|AUGMENTATION DOT)$/} Let $FourteenSpacingViramas = [\p{U15.1.0:ccc=9}&\p{U15.1.0:gc=Mc}] Let $TwoVietnameseReadingMarks = [\p{U15.1.0:ccc=6}] -[\P{U4.0.0:ccc=0} - \p{U4.0.0:Grapheme_Extend}] = [$TwoForgottenMusicalSymbols \p{Name=/^MUSICAL SYMBOL COMBINING FLAG-[3-5]$/}] +[\P{U4.0.0:ccc=0} - \p{U4.0.0:Grapheme_Extend}] = [$TwoForgottenMusicalSymbols + \p{Name=/^MUSICAL SYMBOL COMBINING FLAG-[3-5]$/}] [\P{U4.1.0:ccc=0} - \p{U4.1.0:GCB=Extend}] = $TwoForgottenMusicalSymbols -[\P{U15.1.0:ccc=0} - \p{U15.1.0:GCB=Extend}] = [$TwoForgottenMusicalSymbols $FourteenSpacingViramas $TwoVietnameseReadingMarks] +[\P{U15.1.0:ccc=0} - \p{U15.1.0:GCB=Extend}] = [$TwoForgottenMusicalSymbols + $FourteenSpacingViramas + $TwoVietnameseReadingMarks] \P{ ccc=0} ⊆ \p{ GCB=Extend} # Characters that appear in non-initial position in the canonical decomposition @@ -1037,7 +1142,17 @@ $NonOtherLetterIdeographs = [\p{Ideographic} - \p{gc=Lo}] Let $CommonIdeographs = [〆] $CommonIdeographs = [\p{Ideographic} & \p{sc=Common}] -\p{Ideographic} = [ $NonOtherLetterIdeographs $CommonIdeographs [ \p{gc=Lo} & [\p{Script=Han} \p{Script=Tangut} \p{Script=Nushu} \p{Script=Khitan_Small_Script}] ] ] +\p{Ideographic} = [ + $NonOtherLetterIdeographs $CommonIdeographs + [ + \p{gc=Lo} & [ + \p{Script=Han} + \p{Script=Tangut} + \p{Script=Nushu} + \p{Script=Khitan_Small_Script} + ] + ] +] [ [\p{Ideographic}&\p{sc=Han}] - \p{nfkcqc=n} - $NonOtherLetterIdeographs ] = \p{Unified_Ideograph} @@ -1046,7 +1161,19 @@ Let $unihanScope = [\p{Block=/^CJK.(Unified|Compatibility).Ideographs/} - \p{gc= $unihanScope = [\p{gc=Lo} & \p{sc=Hani}] $unihanScope = \P{kRSUnicode=@none@} $unihanScope = \P{kTotalStrokes=@none@} -$unihanScope = [ \P{kIRG_GSource=@none@} \P{kIRG_HSource=@none@} \P{kIRG_JSource=@none@} \P{kIRG_KPSource=@none@} \P{kIRG_KSource=@none@} \P{kIRG_MSource=@none@} \P{kIRG_SSource=@none@} \P{kIRG_TSource=@none@} \P{kIRG_UKSource=@none@} \P{kIRG_USource=@none@} \P{kIRG_VSource=@none@} ] +$unihanScope = [ + \P{kIRG_GSource=@none@} + \P{kIRG_HSource=@none@} + \P{kIRG_JSource=@none@} + \P{kIRG_KPSource=@none@} + \P{kIRG_KSource=@none@} + \P{kIRG_MSource=@none@} + \P{kIRG_SSource=@none@} + \P{kIRG_TSource=@none@} + \P{kIRG_UKSource=@none@} + \P{kIRG_USource=@none@} + \P{kIRG_VSource=@none@} +] # TODO(eggrobin): Should those two have a kMandarin, or this not actually an invariant? # See https://www.unicode.org/review/pri483/feedback.html#ID20240118004124.