From 0a04be9dd97ce190042caec6c042974de41cd881 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 25 Jun 2024 23:14:51 +0200 Subject: [PATCH 1/6] Allow line breaks in invariant test statements --- .../text/UCD/TestUnicodeInvariants.java | 388 ++++++++++-------- .../unicode/text/UCD/UnicodeInvariantTest.txt | 2 +- 2 files changed, 223 insertions(+), 167 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 320b7d120..a245e641b 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -121,7 +121,6 @@ enum Expected { }; static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[=\u2282\u2283\u2286\u2287∥≉]"); - static final ParsePosition pp = new ParsePosition(0); private static PrintWriter out; @@ -179,102 +178,117 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang } else { out3.write('\uFEFF'); // BOM } + final var noComments = new StringBuilder(); + final List lines = new ArrayList<>(); + final List lineBeginnings = new ArrayList(); try (final BufferedReader in = getInputReader(inputFile)) { - errorLister = - new BagFormatter() - .setMergeRanges(doRange) - .setLabelSource(null) - .setUnicodePropertyFactory(LATEST_PROPS) - // .setTableHtml("") - .setShowLiteral(toHTML) - .setFixName(toHTML); - errorLister.setShowTotal(false); - if (doHtml) { - errorLister.setTabber(htmlTabber); - } + in.lines() + .forEach( + line -> { + if (line.startsWith("\uFEFF")) { + line = line.substring(1); + } + lines.add(line); + lineBeginnings.add(noComments.length()); + final int pos = line.indexOf('#'); + if (pos >= 0) { + line = line.substring(0, pos); + } + noComments.append(line.trim() + '\n'); + }); + } + errorLister = + new BagFormatter() + .setMergeRanges(doRange) + .setLabelSource(null) + 
.setUnicodePropertyFactory(LATEST_PROPS) + // .setTableHtml("
") + .setShowLiteral(toHTML) + .setFixName(toHTML); + errorLister.setShowTotal(false); + if (doHtml) { + errorLister.setTabber(htmlTabber); + } - showLister = - new BagFormatter() - .setMergeRanges(doRange) - // .setLabelSource(null) - .setUnicodePropertyFactory(LATEST_PROPS) - // .setTableHtml("
") - .setShowLiteral(toHTML); - showLister.setShowTotal(false); - if (showScript) { - showLister.setValueSource(LATEST_PROPS.getProperty("script")); - } - if (doHtml) { - showLister.setTabber(htmlTabber); - } + showLister = + new BagFormatter() + .setMergeRanges(doRange) + // .setLabelSource(null) + .setUnicodePropertyFactory(LATEST_PROPS) + // .setTableHtml("
") + .setShowLiteral(toHTML); + showLister.setShowTotal(false); + if (showScript) { + showLister.setValueSource(LATEST_PROPS.getProperty("script")); + } + if (doHtml) { + showLister.setTabber(htmlTabber); + } - // symbolTable = new ChainedSymbolTable(); - // new ChainedSymbolTable(new SymbolTable[] { - // - // ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"), - // - // ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")}); - for (int lineNumber = 1; ; ++lineNumber) { - String line = in.readLine(); - if (line == null) { - break; - } - try { - if (line.startsWith("\uFEFF")) { - line = line.substring(1); - } - println(line); - line = line.trim(); - final int pos = line.indexOf('#'); - if (pos >= 0) { - line = line.substring(0, pos).trim(); - } - if (line.length() == 0) { - continue; - } - if (line.equalsIgnoreCase("Stop")) { - break; - } else if (line.startsWith("Let")) { - letLine(pp, line); - } else if (line.startsWith("In")) { - inLine(pp, line, inputFile, lineNumber); - } else if (line.startsWith("Propertywise")) { - propertywiseLine(pp, line, inputFile, lineNumber); - } else if (line.startsWith("ShowScript")) { - showScript = true; - } else if (line.startsWith("HideScript")) { - showScript = false; - } else if (line.startsWith("Map")) { - testMapLine(line, pp, lineNumber); - } else if (line.startsWith("ShowMap")) { - showMapLine(line, pp); - } else if (line.startsWith("Show")) { - showLine(line, pp); - } else if (line.startsWith("OnPairsOf")) { - equivalencesLine(line, pp, inputFile, lineNumber); - } else { - testLine(line, pp, inputFile, lineNumber); + final String source = noComments.toString(); + final Function getLineNumber = + position -> { + for (int i = 0; i < lineBeginnings.size(); ++i) { + if (lineBeginnings.get(i) > position.getIndex()) { + return i; // 1-based line number. 
+ } } - } catch (final Exception e) { - parseErrorCount = - parseError(parseErrorCount, line, e, inputFile, lineNumber); - continue; - } + return lineBeginnings.size(); + }; + int lastPrintedLine = 0; + final ParsePosition pp = new ParsePosition(0); + for (; ; ) { + final int statementStart = pp.getIndex(); + final String nextToken = nextToken(pp, source); + while (getLineNumber.apply(pp) > lastPrintedLine) { + println(lines.get(lastPrintedLine++)); + } + if (nextToken == null) { + break; } - println(); - println("**** SUMMARY ****"); - println(); - println("# ParseErrorCount=" + parseErrorCount); - System.out.println("ParseErrorCount=" + parseErrorCount); - println("# TestFailureCount=" + testFailureCount); - System.out.println("TestFailureCount=" + testFailureCount); - if (doHtml) { - out3.println(""); + try { + if (nextToken.equals("Let")) { + letLine(pp, source); + } else if (nextToken.equals("In")) { + inLine(pp, source, inputFile, getLineNumber); + } else if (nextToken.equals("Propertywise")) { + propertywiseLine(pp, source, inputFile, getLineNumber); + } else if (nextToken.equals("Map")) { + testMapLine(source, pp, getLineNumber); + } else if (nextToken.equals("ShowMap")) { + showMapLine(source, pp); + } else if (nextToken.equals("Show")) { + showLine(source, pp); + } else if (nextToken.equals("OnPairsOf")) { + equivalencesLine(source, pp, inputFile, getLineNumber); + } else { + pp.setIndex(statementStart); + testLine(source, pp, inputFile, getLineNumber); + } + } catch (final Exception e) { + parseErrorCount = + parseError( + parseErrorCount, + source, + e, + statementStart, + inputFile, + getLineNumber.apply(pp)); + break; } - out2.append(writer.getBuffer()); } + println(); + println("**** SUMMARY ****"); + println(); + println("# ParseErrorCount=" + parseErrorCount); + System.out.println("ParseErrorCount=" + parseErrorCount); + println("# TestFailureCount=" + testFailureCount); + System.out.println("TestFailureCount=" + testFailureCount); + if (doHtml) { 
+ out3.println(""); + } + out2.append(writer.getBuffer()); } - out = null; } return parseErrorCount + testFailureCount; } @@ -332,17 +346,20 @@ protected String getFailure(int codepoint) { } } - private static void propertywiseLine(ParsePosition pp, String line, String file, int lineNumber) + private static void propertywiseLine( + ParsePosition pp, + String line, + String file, + Function getLineNumber) throws ParseException { - pp.setIndex("Propertywise".length()); - final UnicodeSet set = new UnicodeSet(line, pp, symbolTable); + final UnicodeSet set = parseUnicodeSet(line, pp); if (set.hasStrings()) { throw new ParseException( "Set should contain only single code points for property comparison", pp.getIndex()); } expectToken("AreAlike", pp, line); - if (pp.getIndex() < line.length()) { + if (",".equals(nextToken(new ParsePosition(pp.getIndex()), line))) { expectToken(",", pp, line); expectToken("Except", pp, line); expectToken(":", pp, line); @@ -398,7 +415,9 @@ private static void propertywiseLine(ParsePosition pp, String line, String file, testFailureCount++; printErrorLine("Test Failure", Side.START, testFailureCount); reportTestFailure( - file, lineNumber, String.join("\n", errorMessageLines).replace('\t', ' ')); + file, + getLineNumber.apply(pp), + String.join("\n", errorMessageLines).replace('\t', ' ')); out.println("
"); for (String errorMessageLine : errorMessageLines) { out.println("
"); @@ -410,10 +429,13 @@ private static void propertywiseLine(ParsePosition pp, String line, String file, } } - private static void equivalencesLine(String line, ParsePosition pp, String file, int lineNumber) + private static void equivalencesLine( + String line, + ParsePosition pp, + String file, + Function getLineNumber) throws ParseException { - pp.setIndex("OnPairsOf".length()); - final UnicodeSet domain = new UnicodeSet(line, pp, symbolTable); + final UnicodeSet domain = parseUnicodeSet(line, pp); expectToken(",", pp, line); expectToken("EqualityOf", pp, line); final var leftProperty = CompoundProperty.of(LATEST_PROPS, line, pp); @@ -592,7 +614,9 @@ private static void equivalencesLine(String line, ParsePosition pp, String file, errorMessageLines.addAll(counterexamples); if (failure) { reportTestFailure( - file, lineNumber, String.join("\n", errorMessageLines).replace('\t', ' ')); + file, + getLineNumber.apply(pp), + String.join("\n", errorMessageLines).replace('\t', ' ')); } out.println(failure ? "" : "
"); for (String counterexample : counterexamples) { @@ -606,9 +630,12 @@ private static void equivalencesLine(String line, ParsePosition pp, String file, } } - private static void inLine(ParsePosition pp, String line, String file, int lineNumber) + private static void inLine( + ParsePosition pp, + String line, + String file, + Function getLineNumber) throws ParseException { - pp.setIndex(2); final PropertyPredicate propertyPredicate = getPropertyPredicate(pp, line); final UnicodeMap failures = propertyPredicate.getFailures(); final UnicodeSet failureSet = failures.keySet(); @@ -627,7 +654,8 @@ private static void inLine(ParsePosition pp, String line, String file, int lineN errorLister.setLineSeparator("\n"); errorLister.showSetNames(new PrintWriter(monoTable), failureSet); errorLister.setTabber(htmlTabber); - reportTestFailure(file, lineNumber, errorMessage + "\n" + monoTable.toString()); + reportTestFailure( + file, getLineNumber.apply(pp), errorMessage + "\n" + monoTable.toString()); if (doHtml) { out.println("
"); @@ -642,21 +670,42 @@ private static void inLine(ParsePosition pp, String line, String file, int lineN } } - private static void expectToken(String token, ParsePosition pp, String line) + private static String nextTokenNoSpace(ParsePosition pp, String text) { + if (pp.getIndex() == text.length()) { + return null; + } + int start = pp.getIndex(); + if (PATTERN_SYNTAX.contains(text.codePointAt(start))) { + final String result = Character.toString(text.codePointAt(start)); + pp.setIndex(start + result.length()); + return result; + } else { + final String result = scan(PATTERN_SYNTAX_OR_WHITE_SPACE, text, pp, false); + return result.isEmpty() ? null : result; + } + } + + private static String nextToken(ParsePosition pp, String text) { + scan(PATTERN_WHITE_SPACE, text, pp, true); + return nextTokenNoSpace(pp, text); + } + + private static void expectToken(String token, ParsePosition pp, String text) throws ParseException { - scan(PATTERN_WHITE_SPACE, line, pp, true); - if (!line.substring(pp.getIndex()).startsWith(token)) { - throw new ParseException("Expected " + token, pp.getIndex()); + final var next = new ParsePosition(pp.getIndex()); + final String actual = nextToken(next, text); + if (!token.equals(actual)) { + throw new ParseException( + "Expected '" + token + "', got '" + actual + "'", pp.getIndex()); } - pp.setIndex(pp.getIndex() + token.length()); - scan(PATTERN_WHITE_SPACE, line, pp, true); + pp.setIndex(next.getIndex()); } private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String line) throws ParseException { PropertyPredicate predicate; - final UnicodeSet valueSet = new UnicodeSet(line, pp, symbolTable); + final UnicodeSet valueSet = parseUnicodeSet(line, pp); expectToken(",", pp, line); final UnicodeProperty property1 = CompoundProperty.of(LATEST_PROPS, line, pp); final int cp = line.codePointAt(pp.getIndex()); @@ -674,7 +723,7 @@ private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String l final var containment 
= new PropertyValueContainment(); containment.shouldBeInSet = cp == '∈'; pp.setIndex(pp.getIndex() + 1); - containment.set = new UnicodeSet(line, pp, symbolTable); + containment.set = parseUnicodeSet(line, pp); predicate = containment; break; default: @@ -683,9 +732,6 @@ private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String l predicate.valueSet = valueSet; predicate.property1 = property1; scan(PATTERN_WHITE_SPACE, line, pp, true); - if (pp.getIndex() != line.length()) { - throw new ParseException(line, pp.getIndex()); - } return predicate; } @@ -705,8 +751,8 @@ enum Type { private Function, String> sequenceReduction; } - private static final UnicodeSet PROPCHARS = - new UnicodeSet("[a-zA-Z0-9.\\:\\-\\_\\u0020\\p{pattern white space}]"); + // TODO(egg): Consider bringing back Pattern_White_Space if requiring semicolons. + private static final UnicodeSet PROPCHARS = new UnicodeSet("[a-zA-Z0-9.\\:\\-\\_\\u0020}]"); private final List propOrFilters = new ArrayList(); static UnicodeProperty of( @@ -722,7 +768,7 @@ static UnicodeProperty of( } else if (line.charAt(pp.getIndex()) == '(') { final FilterOrProp propOrFilter = new FilterOrProp(); final var matcher = - Pattern.compile("(\\( *([^ )]+)(?: +([^)]+))? *\\)).*") + Pattern.compile("(\\( *([^ )]+)(?: +([^)]+))? 
*\\)).*", Pattern.DOTALL) .matcher(line.substring(pp.getIndex())); if (!matcher.matches()) { throw new IllegalArgumentException( @@ -964,37 +1010,33 @@ protected String _getVersion() { } } - private static void letLine(ParsePosition pp, String line) { - final int x = line.indexOf('='); - final String variable = line.substring(3, x).trim(); - if (!variable.startsWith("$")) { - throw new IllegalArgumentException("Variable must begin with '$': "); - } - final String value = line.substring(x + 1).trim(); - pp.setIndex(0); - final UnicodeSet valueSet = new UnicodeSet("[" + value + "]", pp, symbolTable); + private static void letLine(ParsePosition pp, String source) throws ParseException { + expectToken("$", pp, source); + final String variable = nextTokenNoSpace(pp, source); + expectToken("=", pp, source); + final int valueStart = pp.getIndex(); + final UnicodeSet valueSet = parseUnicodeSet(source, pp); valueSet.complement().complement(); - symbolTable.add(variable.substring(1), valueSet.toPattern(false)); + symbolTable.add(variable, valueSet.toPattern(false)); + final String value = source.substring(valueStart, pp.getIndex()); if (DEBUG) { System.out.println("Added variable: <" + variable + "><" + value + ">"); } - showSet(pp, value); + showSet(new ParsePosition(0), value); } - private static void showLine(String line, ParsePosition pp) { - String part = line.substring(4).trim(); - if (part.startsWith("Each")) { - part = part.substring(4).trim(); + private static void showLine(String source, ParsePosition pp) { + final var next = new ParsePosition(pp.getIndex()); + if (next.equals("Each")) { showLister.setMergeRanges(false); } - showSet(pp, part); + showSet(pp, source); showLister.setMergeRanges(doRange); } private static void showMapLine(String line, ParsePosition pp) { String part = line.substring(7).trim(); - pp.setIndex(0); pp.setErrorIndex(-1); if (part.startsWith("Each")) { part = part.substring(4).trim(); @@ -1009,33 +1051,28 @@ private static void 
showMapLine(String line, ParsePosition pp) { showLister.setMergeRanges(doRange); } - private static void testLine(String line, ParsePosition pp, String file, int lineNumber) + private static void testLine( + String source, + ParsePosition pp, + String file, + Function getLineNumber) throws ParseException { - if (line.startsWith("Test")) { - line = line.substring(4).trim(); - } - char relation = 0; String rightSide = null; String leftSide = null; UnicodeSet leftSet = null; UnicodeSet rightSet = null; - pp.setIndex(0); - leftSet = new UnicodeSet(line, pp, symbolTable); - leftSide = line.substring(0, pp.getIndex()); - scan(PATTERN_WHITE_SPACE, line, pp, true); - relation = line.charAt(pp.getIndex()); + final int leftStart = pp.getIndex(); + leftSet = parseUnicodeSet(source, pp); + leftSide = source.substring(leftStart, pp.getIndex()); + scan(PATTERN_WHITE_SPACE, source, pp, true); + relation = source.charAt(pp.getIndex()); checkRelation(pp, relation); pp.setIndex(pp.getIndex() + 1); // skip char - scan(PATTERN_WHITE_SPACE, line, pp, true); - final int start = pp.getIndex(); - rightSet = new UnicodeSet(line, pp, symbolTable); - rightSide = line.substring(start, pp.getIndex()); - scan(PATTERN_WHITE_SPACE, line, pp, true); - if (line.length() != pp.getIndex()) { - throw new ParseException("Extra characters at end", pp.getIndex()); - } + final int rightStart = pp.getIndex(); + rightSet = parseUnicodeSet(source, pp); + rightSide = source.substring(rightStart, pp.getIndex()); Expected right_left = Expected.irrelevant; Expected rightAndLeft = Expected.irrelevant; @@ -1078,7 +1115,7 @@ private static void testLine(String line, ParsePosition pp, String file, int lin "But Not In", leftSide, file, - lineNumber); + getLineNumber.apply(pp)); checkExpected( rightAndLeft, new UnicodeSet(rightSet).retainAll(leftSet), @@ -1087,7 +1124,7 @@ private static void testLine(String line, ParsePosition pp, String file, int lin "And In", leftSide, file, - lineNumber); + 
getLineNumber.apply(pp)); checkExpected( left_right, new UnicodeSet(leftSet).removeAll(rightSet), @@ -1096,7 +1133,7 @@ private static void testLine(String line, ParsePosition pp, String file, int lin "But Not In", rightSide, file, - lineNumber); + getLineNumber.apply(pp)); } public static void checkRelation(ParsePosition pp, char relation) throws ParseException { @@ -1176,7 +1213,8 @@ private static void checkExpected( getProperties(Settings.lastVersion), IndexUnicodeProperties.make(Settings.lastVersion))); - private static void testMapLine(String line, ParsePosition pp, int lineNumber) + private static void testMapLine( + String line, ParsePosition pp, Function getLineNumber) throws ParseException { char relation = 0; String rightSide = null; @@ -1184,7 +1222,6 @@ private static void testMapLine(String line, ParsePosition pp, int lineNumber) UnicodeMap leftSet = null; UnicodeMap rightSet = null; - pp.setIndex(3); leftSet = UMP.parse(line, pp); leftSide = line.substring(3, pp.getIndex()); scan(PATTERN_WHITE_SPACE, line, pp, true); @@ -1240,7 +1277,7 @@ private static void testMapLine(String line, ParsePosition pp, int lineNumber) rightSide, "But Not In", leftSide, - lineNumber); + getLineNumber.apply(pp)); checkExpected( rightAndLeft, UnicodeMapParser.retainAll(new UnicodeMap().putAll(rightSet), leftSet), @@ -1248,7 +1285,7 @@ private static void testMapLine(String line, ParsePosition pp, int lineNumber) rightSide, "And In", leftSide, - lineNumber); + getLineNumber.apply(pp)); checkExpected( left_right, UnicodeMapParser.removeAll(new UnicodeMap().putAll(leftSet), rightSet), @@ -1256,7 +1293,7 @@ private static void testMapLine(String line, ParsePosition pp, int lineNumber) leftSide, "But Not In", rightSide, - lineNumber); + getLineNumber.apply(pp)); } private static void checkExpected( @@ -1303,8 +1340,7 @@ private static void checkExpected( } private static void showSet(ParsePosition pp, final String value) { - pp.setIndex(0); - UnicodeSet valueSet = new 
UnicodeSet(value, pp, symbolTable); + UnicodeSet valueSet = parseUnicodeSet(value, pp); final int totalSize = valueSet.size(); int abbreviated = 0; if (showRangeLimit >= 0) { @@ -1344,15 +1380,28 @@ private static void showSet(ParsePosition pp, final String value) { } private static int parseError( - int parseErrorCount, String line, Exception e, String file, int lineNumber) { + int parseErrorCount, + String source, + Exception e, + int statementStart, + String file, + int lineNumber) { parseErrorCount++; if (e instanceof ParseException) { final int index = ((ParseException) e).getErrorOffset(); - line = line.substring(0, index) + "☞" + line.substring(index); + final int eol = source.indexOf("\n", index); + source = + source.substring(statementStart, index) + + "☞" + + source.substring(index, eol >= 0 ? eol : source.length()); + } else { + final int sol = source.lastIndexOf("\n", statementStart); + final int eol = source.indexOf("\n", statementStart); + source = source.substring(sol >= 0 ? sol : 0, eol >= 0 ? eol : source.length()); } printErrorLine("Parse Failure", Side.START, parseErrorCount); - println("**** PARSE ERROR:\t" + line); + println("**** PARSE ERROR:\t" + source); out.println("
");
         final String message = e.getMessage();
         if (message != null) {
@@ -1400,6 +1449,10 @@ private static void printErrorLine(String title, Side side, int testFailureCount
             Transliterator.createFromRules("any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);
     private static final UnicodeSet PATTERN_WHITE_SPACE =
             new UnicodeSet("\\p{pattern white space}").freeze();
+    private static final UnicodeSet PATTERN_SYNTAX = new UnicodeSet("\\p{pattern syntax}").freeze();
+    private static final UnicodeSet PATTERN_SYNTAX_OR_WHITE_SPACE =
+            new UnicodeSet("[\\p{pattern white space}\\p{pattern syntax}]").freeze();
+
     private static int testFailureCount;
     private static int parseErrorCount;
     private static BagFormatter errorLister;
@@ -1518,7 +1571,7 @@ public int compare(String o1, String o2) {
 
         public void add(String variable, String value) {
             if (variables.containsKey(variable)) {
-                throw new IllegalArgumentException("Attempt to reset variable");
+                throw new IllegalArgumentException("Attempt to reset variable " + variable);
             }
             variables.put(variable, value.toCharArray());
         }
@@ -1576,7 +1629,10 @@ public boolean applyPropertyAlias(
         }
     }
 
-    public static UnicodeSet parseUnicodeSet(String line, ParsePosition pp) {
-        return new UnicodeSet(line, pp, symbolTable);
+    public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp) { // pp: in/out
+        final var relative = new ParsePosition(0); // index within the substring below
+        final var result = new UnicodeSet(source.substring(pp.getIndex()), relative, symbolTable);
+        pp.setIndex(pp.getIndex() + relative.getIndex()); // convert back to an offset in source
+        return result;
     }
 }
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
index 65d0004eb..6d56fa53b 100644
--- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
+++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -547,7 +547,7 @@ Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+(.[0-9]+)?/}
 
 # Musical symbol combining marks, other oddities
 
-Let $AlphaExclusions = [\uAA7D \u0F3E\u0F3F\u1063\u1064\u1069-\u106D\u1087-\u108C\u108F\u109A\u109B\u1CE1\u1CF7\uAA7B\uABEC\U0001D165\U0001D166\U0001D16D-\U0001D172][[:gc=mc:]&[:ccc=9:][\u302E\u302F]]
+Let $AlphaExclusions = [[\uAA7D \u0F3E\u0F3F\u1063\u1064\u1069-\u106D\u1087-\u108C\u108F\u109A\u109B\u1CE1\u1CF7\uAA7B\uABEC\U0001D165\U0001D166\U0001D16D-\U0001D172][[:gc=mc:]&[:ccc=9:][\u302E\u302F]]]
 # 6.1.0 Added HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
 # 7.0 Added AA7D
 # 10.0 Added 1CF7 (similar to 1CE1)

From 1e5b44e6b12c5f9a580d6b48ff0a9ec933bc38b5 Mon Sep 17 00:00:00 2001
From: Robin Leroy 
Date: Wed, 26 Jun 2024 00:38:37 +0200
Subject: [PATCH 2/6] Break some overly long lines

---
 .../text/UCD/TestUnicodeInvariants.java       |   9 +-
 .../unicode/text/UCD/UnicodeInvariantTest.txt | 191 +++++++++++++++---
 2 files changed, 166 insertions(+), 34 deletions(-)

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
index a245e641b..8af2ce387 100644
--- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
+++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -239,8 +239,9 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang
                 final ParsePosition pp = new ParsePosition(0);
                 for (; ; ) {
                     final int statementStart = pp.getIndex();
+                    final int statementLineNumber = getLineNumber.apply(pp);
                     final String nextToken = nextToken(pp, source);
-                    while (getLineNumber.apply(pp) > lastPrintedLine) {
+                    while (statementLineNumber >= lastPrintedLine) {
                         println(lines.get(lastPrintedLine++));
                     }
                     if (nextToken == null) {
@@ -266,6 +267,10 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang
                             testLine(source, pp, inputFile, getLineNumber);
                         }
                     } catch (final Exception e) {
+                        final int lineNumber = getLineNumber.apply(pp);
+                        while (lineNumber > lastPrintedLine) {
+                            println(lines.get(lastPrintedLine++));
+                        }
                         parseErrorCount =
                                 parseError(
                                         parseErrorCount,
@@ -274,7 +279,7 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang
                                         statementStart,
                                         inputFile,
                                         getLineNumber.apply(pp));
-                        break;
+                        continue;
                     }
                 }
                 println();
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
index 6d56fa53b..d712d1b37 100644
--- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
+++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -170,7 +170,15 @@ Let $fii = \p{toNFD=/$foo/}
 Let $codepoints = [\u0000-\U0010FFFF]
 
 Let $gcAllPunctuation = \p{gc=/_Punctuation/}
-$gcAllPunctuation = [\p{gc=Close_Punctuation}\p{gc=Connector_Punctuation}}\p{gc=Dash_Punctuation}\p{gc=Final_Punctuation}\p{gc=Initial_Punctuation}\p{gc=Open_Punctuation}\p{gc=Other_Punctuation}]
+$gcAllPunctuation = [
+    \p{gc=Close_Punctuation}
+    \p{gc=Connector_Punctuation}
+    \p{gc=Dash_Punctuation}
+    \p{gc=Final_Punctuation}
+    \p{gc=Initial_Punctuation}
+    \p{gc=Open_Punctuation}
+    \p{gc=Other_Punctuation}
+]
 
 Let $gcAllSymbols = \p{gc=/_Symbol/}
 $gcAllSymbols = [\p{gc=Math_Symbol}\p{gc=Currency_Symbol}\p{gc=Modifier_Symbol}\p{gc=Other_Symbol}]
@@ -269,13 +277,34 @@ Let $BMExclusions = [ ≠ ∤ ∦ ≢ ≭ ⫝̸ ]
 In [\p{dt=canonical}-$BMExclusions], (delete-adjacent-duplicates) * Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM}
 
 # Additional BIDI invariant constants
-Let $AL_blocks = [\u0600-\u07BF \u0860-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF \uFE70-\uFEFF \U00010D00-\U00010D3F \U00010EC0-\U00010EFF \U00010F30-\U00010F6F \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF]
-Let $R_blocks = [\u0590-\u05FF \u07C0-\u085F \uFB1D-\uFB4F \U00010800-\U00010CFF \U00010D40-\U00010EBF \U00010F00-\U00010F2F \U00010F70-\U00010FFF \U0001E800-\U0001EC6F \U0001ECC0-\U0001ECFF \U0001ED50-\U0001EDFF \U0001EF00-\U0001EFFF]
-# 6.1.0 updated blocks
-# 10.0 updated blocks (Syriac Supplement is bc=AL)
-# 11.0 updated blocks (Hanifi Rohingya, Sogdian, Indic Siyaq Numbers are bc=AL); Old Sogdian is bc=R
-# 12.0 updated blocks (Ottoman Siyaq Numbers is bc=AL)
-# 14.0 updated blocks (Arabic Extended-B is bc=AL)
+Let $AL_blocks = [
+    \u0600-\u07BF
+    \u0860-\u086F          # Syriac Supplement,     10.0
+    \u0870-\u089F          # Arabic Extended-B,     14.0
+    \u08A0-\u08FF
+    \uFB50-\uFDCF
+    \uFDF0-\uFDFF
+    \uFE70-\uFEFF
+    \U00010D00-\U00010D3F  # Hanifi Rohingya,       11.0
+    \U00010EC0-\U00010EFF
+    \U00010F30-\U00010F6F  # Sogdian,               11.0
+    \U0001EC70-\U0001ECBF  # Indic Siyaq Numbers,   11.0
+    \U0001ED00-\U0001ED4F  # Ottoman Siyaq Numbers, 12.0
+    \U0001EE00-\U0001EEFF
+]
+Let $R_blocks = [
+    \u0590-\u05FF
+    \u07C0-\u085F
+    \uFB1D-\uFB4F
+    \U00010800-\U00010CFF
+    \U00010D40-\U00010EBF
+    \U00010F00-\U00010F2F
+    \U00010F70-\U00010FFF
+    \U0001E800-\U0001EC6F
+    \U0001ECC0-\U0001ECFF
+    \U0001ED50-\U0001EDFF
+    \U0001EF00-\U0001EFFF
+]
 
 # Unassigned characters in these blocks have R or AL respectively
 \p{Bidi_Class=R} ⊇ [$R_blocks & \p{gc=Cn}]
@@ -292,7 +321,14 @@ $AL_blocks ∥ [\p{Bidi_Class=L} \p{Bidi_Class=R}]
 
 Let $BN_Exceptions = [\u001C-\u001F\u17B4\u17B5]
 
-[\p{Bidi_Class=BN}] = [\p{di}\p{nchar}\p{gc=Cc}-\p{gc=Mc}-\p{gc=Mn}-\p{gc=Me}-\p{Bidi_C}-\p{alpha}-\p{wspace} - $BN_Exceptions]
+[\p{Bidi_Class=BN}] = [
+    \p{di}\p{nchar}\p{gc=Cc}
+    - \p{gc=Mc} - \p{gc=Mn} - \p{gc=Me}
+    - \p{Bidi_C}
+    - \p{alpha}
+    - \p{wspace}
+    - $BN_Exceptions
+]
 
 # Nonspacing and enclosing combining marks are bc=NSM, with a few exceptions (all of which are nonspacing)
 Let $gcMn_bcL = [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F]
@@ -425,25 +461,37 @@ In \P{U-1:GC=Cn}, ccc=U-1:ccc
 
 # Canonical decompositions (minus exclusions) must be identical across releases (also required by strong normalization stability),
 # except where a character and at least one character in its decomposition are both new in the release.
-Let $New_Decompositions = [[\p{Decomposition_Type=Canonical} - \p{Full_Composition_Exclusion}] - [\p{U-1:Decomposition_Type=Canonical} - \p{U-1:Full_Composition_Exclusion}]]
+Let $New_Decompositions = [
+      [    \p{Decomposition_Type=Canonical} -     \p{Full_Composition_Exclusion}]
+    - [\p{U-1:Decomposition_Type=Canonical} - \p{U-1:Full_Composition_Exclusion}]
+]
 $New_Decompositions ⊆ \p{U-1:GC=Cn}
 # Stripping previously-unassigned characters from the current NFD does
 # something, that is, the decomposition contains newly-assigned characters.
 In $New_Decompositions, toNFD * \P{U-1:GC=Cn} ≠ toNFD
 
-Let $Unicode_13_Decompositions = [[\p{U13.0.0:Decomposition_Type=Canonical} - \p{U13.0.0:Full_Composition_Exclusion}] - [\p{U12.1.0:Decomposition_Type=Canonical} - \p{U12.1.0:Full_Composition_Exclusion}]]
+Let $Unicode_13_Decompositions = [
+      [\p{U13.0.0:Decomposition_Type=Canonical} - \p{U13.0.0:Full_Composition_Exclusion}]
+    - [\p{U12.1.0:Decomposition_Type=Canonical} - \p{U12.1.0:Full_Composition_Exclusion}]
+]
 $Unicode_13_Decompositions ⊆ \p{U12.1.0:GC=Cn}
 In $Unicode_13_Decompositions, toNFD * \P{U12.1.0:GC=Cn} ≠ toNFD
 $Unicode_13_Decompositions = [\U00011938]
 $Unicode_13_Decompositions = [\p{Name=DIVES AKURU VOWEL SIGN O}]
 
-Let $Unicode_7_Decompositions = [[\p{U7.0.0:Decomposition_Type=Canonical} - \p{U7.0.0:Full_Composition_Exclusion}] - [\p{U6.3.0:Decomposition_Type=Canonical} - \p{U6.3.0:Full_Composition_Exclusion}]]
+Let $Unicode_7_Decompositions = [
+      [\p{U7.0.0:Decomposition_Type=Canonical} - \p{U7.0.0:Full_Composition_Exclusion}]
+    - [\p{U6.3.0:Decomposition_Type=Canonical} - \p{U6.3.0:Full_Composition_Exclusion}]
+]
 $Unicode_7_Decompositions ⊆ \p{U6.3.0:GC=Cn}
 In $Unicode_7_Decompositions, toNFD * \P{U6.3.0:GC=Cn} ≠ toNFD
 $Unicode_7_Decompositions = [\U0001134B-\U0001134C \U000114BB-\U000114BC \U000114BE \U000115BA-\U000115BB]
 $Unicode_7_Decompositions ⊆ [\p{Name=/^(GRANTHA|TIRHUTA|SIDDHAM) VOWEL SIGN /}]
 
-Let $Unicode_6_1_Decompositions = [[\p{U6.1.0:Decomposition_Type=Canonical} - \p{U6.1.0:Full_Composition_Exclusion}] - [\p{U6.0.0:Decomposition_Type=Canonical} - \p{U6.0.0:Full_Composition_Exclusion}]]
+Let $Unicode_6_1_Decompositions = [
+      [\p{U6.1.0:Decomposition_Type=Canonical} - \p{U6.1.0:Full_Composition_Exclusion}]
+    - [\p{U6.0.0:Decomposition_Type=Canonical} - \p{U6.0.0:Full_Composition_Exclusion}]
+]
 $Unicode_6_1_Decompositions ⊆ \p{U6.0.0:GC=Cn}
 In $Unicode_6_1_Decompositions, toNFD * \P{U6.0.0:GC=Cn} ≠ toNFD
 $Unicode_6_1_Decompositions = [\U0001112E-\U0001112F]
@@ -469,7 +517,9 @@ In $expandingCanonicalDecompositions, Decomposition_Type * (drop 1) * Decomposit
 # Not a stability policy, but it happens to be the case that the second
 # character does not have a decomposition mapping at all:
 In $expandingCanonicalDecompositions, Decomposition_Type * (drop 1) * Decomposition_Mapping = (constant None)
-In $expandingCanonicalDecompositions, Decomposition_Mapping * (drop 1) * Decomposition_Mapping = (drop 1) * Decomposition_Mapping
+In $expandingCanonicalDecompositions,
+      Decomposition_Mapping * (drop 1) * Decomposition_Mapping
+    =                         (drop 1) * Decomposition_Mapping
 
 # Stability: Canonical mappings (Decomposition_Mapping property values) are
 # always limited so that no string when normalized to NFC expands to more than
@@ -488,7 +538,8 @@ In \P{U-1:GC=Cn}, dm=U-1:dm
 # must have ccc=0, except for the Decomposition_Mapping of the following four
 # characters: U+0344, U+0F73, U+0F75, U+0F81.
 Let $canonicallyExpandingNonstarters = [\u0344 \u0F73 \u0F75 \u0F81]
-In [$expandingCanonicalDecompositions - $canonicallyExpandingNonstarters], ccc * (take 1) * Decomposition_Mapping = (constant Not_Reordered)
+In [$expandingCanonicalDecompositions - $canonicallyExpandingNonstarters],
+    ccc * (take 1) * Decomposition_Mapping = (constant Not_Reordered)
 
 # U6.0: Construction of Full_Composition_Exclusion
 # Primary Composites don't include singletons, ccc!=0, or sequences starting with ccc!=0
@@ -584,7 +635,13 @@ Show [\u20b9]
 Let $nonAlphabeticBindus = []
 [\p{InSc=Bindu} - \p{Alphabetic}] = $nonAlphabeticBindus
 
-Let $nonAlphabeticDependentVowels = [\N{ORIYA SIGN OVERLINE}\N{THAI CHARACTER MAITAIKHU}\N{LIMBU SIGN KEMPHRENG}\N{SHARADA VOWEL MODIFIER MARK}\N{SHARADA EXTRA SHORT VOWEL MARK}]
+Let $nonAlphabeticDependentVowels = [
+    \N{ORIYA SIGN OVERLINE}
+    \N{THAI CHARACTER MAITAIKHU}
+    \N{LIMBU SIGN KEMPHRENG}
+    \N{SHARADA VOWEL MODIFIER MARK}
+    \N{SHARADA EXTRA SHORT VOWEL MARK}
+]
 [\p{InSC=Vowel_Dependent} - \p{Alphabetic}] = $nonAlphabeticDependentVowels
 
 # Several invariants from L2/24-009 item 2.2.
@@ -596,7 +653,11 @@ Let $nonAlphabeticAvagrahas = [\N{TIBETAN MARK PALUTA}]  # A punctuation mark.
 [\p{InSC=Avagraha} - $nonAlphabeticAvagrahas] ⊆ \p{Alphabetic}
 
 # Name-based checks.
-Let $nonLowercaseSmallLetters = [ \p{name=/^LIMBU SMALL LETTER/} \N{TURNED GREEK SMALL LETTER IOTA} \p{name=/^(SQUARED|PARENTHESIZED|TAG) LATIN SMALL LETTER/} ]
+Let $nonLowercaseSmallLetters = [
+    \p{name=/^LIMBU SMALL LETTER/}
+    \N{TURNED GREEK SMALL LETTER IOTA}
+    \p{name=/^(SQUARED|PARENTHESIZED|TAG) LATIN SMALL LETTER/}
+]
 Let $nonLowercaseSmallModifierLetters = [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ]
 [ \p{name=/\bSMALL LETTER\b/}-\p{gc=Mn}-\p{gc=Lt} - $nonLowercaseSmallLetters ] ⊆ \p{Lowercase}
 [ [\p{gc=Lm} & \p{name=/SMALL/}] - $nonLowercaseSmallModifierLetters ] ⊆ \p{Lowercase}
@@ -633,14 +694,39 @@ In \P{Other_Joining_Type=Deduce_From_General_Category}, Joining_Type = Other_Joi
 # LineBreak property
 ##########################
 
-Let $IDInclusions = [[:block=/Ideographs/:] [[\U00020000-\U0003FFFF][\U0001F000-\U0001FFFF] - [[:block=Symbols for Legacy Computing:][:block=Supplemental Arrows C:]]] & [:gc=Cn:] - [:NChar:]]
-# 9.0 Added range 1F000..1FFFF: all undesignated code points in this range are lb=ID
-# 13.0 exclude those in 1FB00..1FBFF Symbols for Legacy Computing
-# 16.0 exclude Supplemental Arrows C
+Let $IDInclusions = [
+    [:block=/Ideographs/:]
+    [
+        [\U00020000-\U0003FFFF]  # Planes 2 and 3, lb=ID since 5.2.
+        [\U0001F000-\U0001FFFF]  # lb=ID default since 9.0, 147-C25,
+        - [                      # with exceptions:
+              [:block=Symbols for Legacy Computing:]  # since 13.0, 115-C27
+              [:block=Supplemental Arrows C:]         # since 16.0, 177-C47.
+          ]
+    ] & [:gc=Cn:] - [:NChar:]
+]
 \p{LB=ID} ⊃ $IDInclusions
-\p{Line_Break=Unknown} = [\p{General_Category=Unassigned} \p{GeneralCategory=PrivateUse} - $IDInclusions - [\u20C0-\u20CF]]
-
-Let $BrahmicLineBreaking = [\p{sc=Balinese}\p{sc=Batak}\p{sc=Brahmi}\p{sc=Cham}\p{sc=DivesAkuru}\p{sc=Grantha}\p{sc=Javanese}\p{sc=Makasar}\p{sc=Kawi}\p{sc=Cham}\p{sc=Makasar}\p{sc=Tulu_Tigalari}\p{sc=Gurung_Khema}]
+\p{Line_Break=Unknown} = [
+      \p{General_Category=Unassigned} \p{GeneralCategory=PrivateUse}
+    - $IDInclusions
+    - [\u20C0-\u20CF]  # Unassigned currency symbols are lb=PR since 6.3, 133-C26.
+]
+
+Let $BrahmicLineBreaking = [
+    \p{sc=Balinese}
+    \p{sc=Batak}
+    \p{sc=Brahmi}
+    \p{sc=Cham}
+    \p{sc=DivesAkuru}
+    \p{sc=Grantha}
+    \p{sc=Javanese}
+    \p{sc=Makasar}
+    \p{sc=Kawi}
+    \p{sc=Cham}
+    \p{sc=Makasar}
+    \p{sc=Tulu_Tigalari}
+    \p{sc=Gurung_Khema}
+]
 Let $VFScripts = [\p{sc=Batak}]
 
 Let $OPInclusions = [\u00A1\u00BF\u2E18\U00013258-\U0001325A\U00013286\U00013288\U00013379\U0001342F\U00013437\U0001343C\U0001343E\U000145CE\U0001E95E-\U0001E95F]
@@ -658,9 +744,15 @@ Let $OPInclusions = [\u00A1\u00BF\u2E18\U00013258-\U0001325A\U00013286\U00013288
 \p{LB=VI} = [[\p{Indic_Syllabic_Category=Virama}\p{Indic_Syllabic_Category=Invisible_Stacker}] & $BrahmicLineBreaking]
 \p{LB=VF} = [\p{Indic_Syllabic_Category=Reordering_Killer} & $VFScripts]
 
-# 15.1: Action item UTC-176-A81: change [[:PCM:]-\u070F] lb=AL->NU
-\p{LB=CM} = [[\u3035] \p{GC=Mn} \p{GC=Me} \p{GC=Mc} \p{GC=Cc} \p{GC=Cf} -[\U00013437\U00013438\U0001343C-\U0001343F] -\p{LB=SA} -\p{LB=WJ} -\p{LB=ZW} -\p{LB=BA} -\p{LB=LF} -\p{LB=BK} -\p{LB=CR} -\p{LB=NL} -\p{LB=GL} -\p{LB=AL} -\p{LB=ZWJ} - \p{LB=VI} - \p{LB=VF} - \p{LB=NU}]
-# Excluded Egyptian controls begin/end segment etc. 13437, 13438 & 1343C..1343F (gc=Cf, lb=OP/CL)
+\p{LB=CM} = [
+      [\u3035] \p{GC=Mn} \p{GC=Me} \p{GC=Mc} \p{GC=Cc} \p{GC=Cf}
+    - [\U00013437\U00013438\U0001343C-\U0001343F]  # Egyptian controls begin/end segment etc. (gc=Cf, lb=OP/CL)
+    - \p{LB=SA} - \p{LB=WJ} - \p{LB=ZW} - \p{LB=BA}
+    - \p{LB=LF} - \p{LB=BK} - \p{LB=CR} - \p{LB=NL}
+    - \p{LB=GL} - \p{LB=AL} - \p{LB=ZWJ}
+    - \p{LB=VI} - \p{LB=VF}
+    - \p{LB=NU}  # 176-A81 changed [[:PCM:]-\u070F] from lb=AL to lb=NU
+]
 
 #  3.0.0: Numeric characters consist of decimal digits (all characters of General_Category Nd),
 #         except those with East_Asian_Width F (Fullwidth)
@@ -725,7 +817,17 @@ Let $QUInclusions = [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\u
 # covered by adding them to the exception set $SAScriptExceptions for the test.
 
 # SA are limited to certain scripts:
-Let $SAScripts = [\p{script=ahom} \p{script=thai} \p{script=lao} \p{script=myanmar} \p{script=khmer} \p{script=Tai_Le} \p{script=New_Tai_Lue} \p{script=Tai_Tham} \p{script=Tai_Viet}]
+Let $SAScripts = [
+    \p{script=ahom}
+    \p{script=thai}
+    \p{script=lao}
+    \p{script=myanmar}
+    \p{script=khmer}
+    \p{script=Tai_Le}
+    \p{script=New_Tai_Lue}
+    \p{script=Tai_Tham}
+    \p{script=Tai_Viet}
+]
 $SAScripts ⊇ \p{LineBreak=SA}
 
 # And in $SA scripts, they are all the alphabetic spacing characters, plus some odd Cf & Mn, plus the NEW TAI LUE THAM DIGIT ONE
@@ -845,9 +947,12 @@ Let $PostBaseSpacingMarks_Missed = []
 Let $TwoForgottenMusicalSymbols = \p{Name=/^MUSICAL SYMBOL COMBINING (SPRECHGESANG STEM|AUGMENTATION DOT)$/}
 Let $FourteenSpacingViramas = [\p{U15.1.0:ccc=9}&\p{U15.1.0:gc=Mc}]
 Let $TwoVietnameseReadingMarks = [\p{U15.1.0:ccc=6}]
-[\P{U4.0.0:ccc=0}  - \p{U4.0.0:Grapheme_Extend}] = [$TwoForgottenMusicalSymbols \p{Name=/^MUSICAL SYMBOL COMBINING FLAG-[3-5]$/}]
+[\P{U4.0.0:ccc=0}  - \p{U4.0.0:Grapheme_Extend}] = [$TwoForgottenMusicalSymbols
+                                                    \p{Name=/^MUSICAL SYMBOL COMBINING FLAG-[3-5]$/}]
 [\P{U4.1.0:ccc=0}  - \p{U4.1.0:GCB=Extend}]      = $TwoForgottenMusicalSymbols
-[\P{U15.1.0:ccc=0} - \p{U15.1.0:GCB=Extend}]     = [$TwoForgottenMusicalSymbols $FourteenSpacingViramas $TwoVietnameseReadingMarks]
+[\P{U15.1.0:ccc=0} - \p{U15.1.0:GCB=Extend}]     = [$TwoForgottenMusicalSymbols
+                                                    $FourteenSpacingViramas
+                                                    $TwoVietnameseReadingMarks]
  \P{        ccc=0} ⊆ \p{        GCB=Extend}
 
 # Characters that appear in non-initial position in the canonical decomposition
@@ -1037,7 +1142,17 @@ $NonOtherLetterIdeographs = [\p{Ideographic} - \p{gc=Lo}]
 Let $CommonIdeographs = [〆]
 $CommonIdeographs = [\p{Ideographic} & \p{sc=Common}]
 
-\p{Ideographic} = [ $NonOtherLetterIdeographs $CommonIdeographs [ \p{gc=Lo} & [\p{Script=Han} \p{Script=Tangut} \p{Script=Nushu} \p{Script=Khitan_Small_Script}] ] ]
+\p{Ideographic} = [
+    $NonOtherLetterIdeographs $CommonIdeographs
+    [
+        \p{gc=Lo} & [
+            \p{Script=Han}
+            \p{Script=Tangut}
+            \p{Script=Nushu}
+            \p{Script=Khitan_Small_Script}
+        ]
+    ]
+]
 
 [ [\p{Ideographic}&\p{sc=Han}] - \p{nfkcqc=n} - $NonOtherLetterIdeographs ] = \p{Unified_Ideograph}
 
@@ -1046,7 +1161,19 @@ Let $unihanScope = [\p{Block=/^CJK.(Unified|Compatibility).Ideographs/} - \p{gc=
 $unihanScope = [\p{gc=Lo} & \p{sc=Hani}]
 $unihanScope = \P{kRSUnicode=@none@}
 $unihanScope = \P{kTotalStrokes=@none@}
-$unihanScope = [ \P{kIRG_GSource=@none@} \P{kIRG_HSource=@none@} \P{kIRG_JSource=@none@} \P{kIRG_KPSource=@none@} \P{kIRG_KSource=@none@} \P{kIRG_MSource=@none@} \P{kIRG_SSource=@none@} \P{kIRG_TSource=@none@} \P{kIRG_UKSource=@none@} \P{kIRG_USource=@none@} \P{kIRG_VSource=@none@} ]
+$unihanScope = [
+    \P{kIRG_GSource=@none@}
+    \P{kIRG_HSource=@none@}
+    \P{kIRG_JSource=@none@}
+    \P{kIRG_KPSource=@none@}
+    \P{kIRG_KSource=@none@}
+    \P{kIRG_MSource=@none@}
+    \P{kIRG_SSource=@none@}
+    \P{kIRG_TSource=@none@}
+    \P{kIRG_UKSource=@none@}
+    \P{kIRG_USource=@none@}
+    \P{kIRG_VSource=@none@}
+]
 
 # TODO(eggrobin): Should those two have a kMandarin, or this not actually an invariant?
 # See https://www.unicode.org/review/pri483/feedback.html#ID20240118004124.

From f3a1b4f883cc7e94dd4c521b285cb74b53fa626b Mon Sep 17 00:00:00 2001
From: Robin Leroy 
Date: Wed, 26 Jun 2024 01:57:44 +0200
Subject: [PATCH 3/6] Better error reporting

---
 .../text/UCD/TestUnicodeInvariants.java       | 118 +++++++++++-------
 1 file changed, 76 insertions(+), 42 deletions(-)

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
index 8af2ce387..bb706a526 100644
--- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
+++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -237,11 +237,12 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang
                         };
                 int lastPrintedLine = 0;
                 final ParsePosition pp = new ParsePosition(0);
+                boolean followingParseError = false;
                 for (; ; ) {
                     final int statementStart = pp.getIndex();
                     final int statementLineNumber = getLineNumber.apply(pp);
                     final String nextToken = nextToken(pp, source);
-                    while (statementLineNumber >= lastPrintedLine) {
+                    while (lastPrintedLine < statementLineNumber) {
                         println(lines.get(lastPrintedLine++));
                     }
                     if (nextToken == null) {
@@ -266,20 +267,35 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang
                             pp.setIndex(statementStart);
                             testLine(source, pp, inputFile, getLineNumber);
                         }
+                        followingParseError = false;
                     } catch (final Exception e) {
-                        final int lineNumber = getLineNumber.apply(pp);
-                        while (lineNumber > lastPrintedLine) {
-                            println(lines.get(lastPrintedLine++));
+                        if (!followingParseError) {
+                            final int lineNumber = getLineNumber.apply(pp);
+                            while (lineNumber > lastPrintedLine) {
+                                println(lines.get(lastPrintedLine++));
+                            }
+                            parseErrorCount =
+                                    parseError(
+                                            parseErrorCount,
+                                            source,
+                                            e,
+                                            statementStart,
+                                            inputFile,
+                                            getLineNumber.apply(pp));
+                        }
+                        // Give up on the rest of this line; it is unlikely to contain anything we
+                        // can parse.
+                        // Resume at the next line, but since that may be the remainder of what we
+                        // failed to parse, do not report errors again until we successfully parse
+                        // *something*.
+                        final int nextLine = source.indexOf("\n", pp.getIndex());
+                        if (nextLine >= 0) {
+                            pp.setIndex(source.indexOf("\n", pp.getIndex()));
+                            followingParseError = true;
+                            continue;
+                        } else {
+                            break;
                         }
-                        parseErrorCount =
-                                parseError(
-                                        parseErrorCount,
-                                        source,
-                                        e,
-                                        statementStart,
-                                        inputFile,
-                                        getLineNumber.apply(pp));
-                        continue;
                     }
                 }
                 println();
@@ -359,7 +375,7 @@ private static void propertywiseLine(
             throws ParseException {
         final UnicodeSet set = parseUnicodeSet(line, pp);
         if (set.hasStrings()) {
-            throw new ParseException(
+            throw new BackwardParseException(
                     "Set should contain only single code points for property comparison",
                     pp.getIndex());
         }
@@ -697,13 +713,9 @@ private static String nextToken(ParsePosition pp, String text) {
 
     private static void expectToken(String token, ParsePosition pp, String text)
             throws ParseException {
-        final var next = new ParsePosition(pp.getIndex());
-        final String actual = nextToken(next, text);
-        if (!token.equals(actual)) {
-            throw new ParseException(
-                    "Expected '" + token + "', got '" + actual + "'", pp.getIndex());
+        if (!token.equals(nextToken(pp, text))) {
+            throw new BackwardParseException("Expected '" + token + "'", pp.getIndex());
         }
-        pp.setIndex(next.getIndex());
     }
 
     private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String line)
@@ -761,24 +773,28 @@ enum Type {
         private final List propOrFilters = new ArrayList();
 
         static UnicodeProperty of(
-                UnicodeProperty.Factory propSource, String line, ParsePosition pp) {
+                UnicodeProperty.Factory propSource, String source, ParsePosition pp)
+                throws ParseException {
             final CompoundProperty result = new CompoundProperty();
             while (true) {
-                scan(PATTERN_WHITE_SPACE, line, pp, true);
-                if (UnicodeSet.resemblesPattern(line, pp.getIndex())) {
+                scan(PATTERN_WHITE_SPACE, source, pp, true);
+                if (UnicodeSet.resemblesPattern(source, pp.getIndex())) {
                     final FilterOrProp propOrFilter = new FilterOrProp();
-                    propOrFilter.filter = parseUnicodeSet(line, pp);
+                    propOrFilter.filter = parseUnicodeSet(source, pp);
                     propOrFilter.type = FilterOrProp.Type.filter;
                     result.propOrFilters.add(propOrFilter);
-                } else if (line.charAt(pp.getIndex()) == '(') {
+                } else if (source.charAt(pp.getIndex()) == '(') {
                     final FilterOrProp propOrFilter = new FilterOrProp();
                     final var matcher =
                             Pattern.compile("(\\( *([^ )]+)(?: +([^)]+))? *\\)).*", Pattern.DOTALL)
-                                    .matcher(line.substring(pp.getIndex()));
+                                    .matcher(source.subSequence(pp.getIndex(), source.length()));
                     if (!matcher.matches()) {
                         throw new IllegalArgumentException(
                                 "Expected ( ), got "
-                                        + line.substring(pp.getIndex()));
+                                        + source.substring(
+                                                pp.getIndex(),
+                                                Math.min(pp.getIndex() + 50, source.length()))
+                                        + "…");
                     }
                     propOrFilter.type = FilterOrProp.Type.sequenceTransformation;
                     final String expression = matcher.group(1);
@@ -851,7 +867,7 @@ static UnicodeProperty of(
                     result.propOrFilters.add(propOrFilter);
                     pp.setIndex(pp.getIndex() + expression.length());
                 } else {
-                    final String propName = scan(PROPCHARS, line, pp, true);
+                    final String propName = scan(PROPCHARS, source, pp, true);
                     if (propName.length() > 0) {
                         final FilterOrProp propOrFilter = new FilterOrProp();
                         final VersionedProperty xprop =
@@ -872,12 +888,12 @@ static UnicodeProperty of(
                         break;
                     }
                 }
-                scan(PATTERN_WHITE_SPACE, line, pp, true);
+                scan(PATTERN_WHITE_SPACE, source, pp, true);
                 final int pos = pp.getIndex();
-                if (pos == line.length()) {
+                if (pos == source.length()) {
                     break;
                 }
-                final int cp = line.charAt(pos);
+                final int cp = source.charAt(pos);
                 if (cp != '*') {
                     break;
                 }
@@ -1031,10 +1047,11 @@ private static void letLine(ParsePosition pp, String source) throws ParseExcepti
         showSet(new ParsePosition(0), value);
     }
 
-    private static void showLine(String source, ParsePosition pp) {
+    private static void showLine(String source, ParsePosition pp) throws ParseException {
         final var next = new ParsePosition(pp.getIndex());
-        if (next.equals("Each")) {
+        if (nextToken(next, source).equals("Each")) {
             showLister.setMergeRanges(false);
+            pp.setIndex(next.getIndex());
         }
         showSet(pp, source);
         showLister.setMergeRanges(doRange);
@@ -1344,7 +1361,7 @@ private static void checkExpected(
         nf.setGroupingUsed(true);
     }
 
-    private static void showSet(ParsePosition pp, final String value) {
+    private static void showSet(ParsePosition pp, final String value) throws ParseException {
         UnicodeSet valueSet = parseUnicodeSet(value, pp);
         final int totalSize = valueSet.size();
         int abbreviated = 0;
@@ -1397,7 +1414,7 @@ private static int parseError(
             final int eol = source.indexOf("\n", index);
             source =
                     source.substring(statementStart, index)
-                            + "☞"
+                            + (e instanceof BackwardParseException ? "☜" : "☞")
                             + source.substring(index, eol >= 0 ? eol : source.length());
         } else {
             final int sol = source.lastIndexOf("\n", statementStart);
@@ -1412,7 +1429,7 @@ private static int parseError(
         if (message != null) {
             println("##" + message);
         }
-        reportParseError(file, lineNumber, message);
+        reportParseError(file, lineNumber, message + "\n" + source);
         e.printStackTrace(out);
 
         out.println("
"); @@ -1448,7 +1465,8 @@ private static void printErrorLine(String title, Side side, int testFailureCount private static final String HTML_RULES_CONTROLS = HTML_RULES - + ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:] - [\\u0020\\u0009]] hex/unicode ; "; + + ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:] - [\\u0020\\u0009\\u000A]] hex/unicode ; " + + "\\u000A > '
'"; public static final Transliterator toHTMLControl = Transliterator.createFromRules("any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD); @@ -1634,10 +1652,26 @@ public boolean applyPropertyAlias( } } - public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp) { - final var relative = new ParsePosition(0); - final var result = new UnicodeSet(source.substring(pp.getIndex()), relative, symbolTable); - pp.setIndex(pp.getIndex() + relative.getIndex()); - return result; + // Some of our parse exceptions are thrown with a parse position before the problem. + // However, others are thrown with the parse position after the problem, so the message must be + // adjusted accordingly. + public static class BackwardParseException extends ParseException { + public BackwardParseException(String s, int errorOffset) { + super(s, errorOffset); + } + } + + public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp) + throws ParseException { + try { + final var result = new UnicodeSet(source, pp, symbolTable); + return result; + } catch (IllegalArgumentException e) { + // ICU produces unhelpful messages when parsing UnicodeSet deep into + // a large string in a string that contains line terminators, as the + // whole string is escaped and printed. 
+ final String message = e.getMessage().split(" at \"", 2)[0]; + throw new BackwardParseException(message, pp.getIndex()); + } } } From cde46bd20c2f69c993212364abad09d4ed4208a9 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 26 Jun 2024 02:14:16 +0200 Subject: [PATCH 4/6] correct references --- .../org/unicode/text/UCD/UnicodeInvariantTest.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index d712d1b37..9f684144f 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -696,11 +696,11 @@ In \P{Other_Joining_Type=Deduce_From_General_Category}, Joining_Type = Other_Joi Let $IDInclusions = [ [:block=/Ideographs/:] - [ - [\U00020000-\U0003FFFF] # Planes 2 and 3, lb=ID since 5.2. - [\U0001F000-\U0001FFFF] # lb=ID default since 9.0, 147-C25, + [ # Some ranges default to lb=ID even outside of any blocks: + [\U00020000-\U0003FFFF] # Planes 2 and 3, lb=ID since 5.2, 115-C27. + [\U0001F000-\U0001FFFF] # SMP range lb=ID by default since 9.0, 147-C25, - [ # with exceptions: - [:block=Symbols for Legacy Computing:] # since 13.0, 115-C27 + [:block=Symbols for Legacy Computing:] # since 13.0, 162-A67; [:block=Supplemental Arrows C:] # since 16.0, 177-C47. 
] ] & [:gc=Cn:] - [:NChar:] From 8620d050260ef5e04702c4817f6fedb1fd869da5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 26 Jun 2024 02:37:50 +0200 Subject: [PATCH 5/6] Probably friendlier to make this one a lookahead --- .../java/org/unicode/text/UCD/TestUnicodeInvariants.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index bb706a526..ef7d5a57c 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -713,9 +713,11 @@ private static String nextToken(ParsePosition pp, String text) { private static void expectToken(String token, ParsePosition pp, String text) throws ParseException { - if (!token.equals(nextToken(pp, text))) { - throw new BackwardParseException("Expected '" + token + "'", pp.getIndex()); + ParsePosition next = new ParsePosition(pp.getIndex()); + if (!token.equals(nextToken(next, text))) { + throw new ParseException("Expected '" + token + "'", pp.getIndex()); } + pp.setIndex(next.getIndex()); } private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String line) From e31dbfaaade633128a997bf6abba0157033ecea3 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 26 Jun 2024 04:27:21 +0200 Subject: [PATCH 6/6] Nicer lookahead --- .../text/UCD/TestUnicodeInvariants.java | 95 ++++++++++++------- 1 file changed, 63 insertions(+), 32 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index ef7d5a57c..d809a1659 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -241,7 +241,7 @@ public static int 
testInvariants(String inputFile, String suffix, boolean doRang for (; ; ) { final int statementStart = pp.getIndex(); final int statementLineNumber = getLineNumber.apply(pp); - final String nextToken = nextToken(pp, source); + final var nextToken = Lookahead.oneToken(pp, source); while (lastPrintedLine < statementLineNumber) { println(lines.get(lastPrintedLine++)); } @@ -249,19 +249,19 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang break; } try { - if (nextToken.equals("Let")) { + if (nextToken.accept("Let")) { letLine(pp, source); - } else if (nextToken.equals("In")) { + } else if (nextToken.accept("In")) { inLine(pp, source, inputFile, getLineNumber); - } else if (nextToken.equals("Propertywise")) { + } else if (nextToken.accept("Propertywise")) { propertywiseLine(pp, source, inputFile, getLineNumber); - } else if (nextToken.equals("Map")) { + } else if (nextToken.accept("Map")) { testMapLine(source, pp, getLineNumber); - } else if (nextToken.equals("ShowMap")) { + } else if (nextToken.accept("ShowMap")) { showMapLine(source, pp); - } else if (nextToken.equals("Show")) { + } else if (nextToken.accept("Show")) { showLine(source, pp); - } else if (nextToken.equals("OnPairsOf")) { + } else if (nextToken.accept("OnPairsOf")) { equivalencesLine(source, pp, inputFile, getLineNumber); } else { pp.setIndex(statementStart); @@ -380,8 +380,7 @@ private static void propertywiseLine( pp.getIndex()); } expectToken("AreAlike", pp, line); - if (",".equals(nextToken(new ParsePosition(pp.getIndex()), line))) { - expectToken(",", pp, line); + if (Lookahead.oneToken(pp, line).accept(",")) { expectToken("Except", pp, line); expectToken(":", pp, line); } @@ -691,33 +690,67 @@ private static void inLine( } } - private static String nextTokenNoSpace(ParsePosition pp, String text) { - if (pp.getIndex() == text.length()) { - return null; + // A one-token lookahead. 
+ // Tokens are defined as runs of [^\p{Pattern_White_Space}\p{Pattern_Syntax}], + // or single code points in \p{Pattern_Syntax}. + private static class Lookahead { + // Advances pp through any pattern white space, then looks ahead one token. + public static Lookahead oneToken(ParsePosition pp, String text) { + scan(PATTERN_WHITE_SPACE, text, pp, true); + return oneTokenNoSpace(pp, text); } - int start = pp.getIndex(); - if (PATTERN_SYNTAX.contains(text.codePointAt(start))) { - final String result = Character.toString(text.codePointAt(start)); - pp.setIndex(start + result.length()); - return result; - } else { - final String result = scan(PATTERN_SYNTAX_OR_WHITE_SPACE, text, pp, false); - return result.isEmpty() ? null : result; + + // Returns null if pp is before pattern white space; otherwise, looks ahead one token. + public static Lookahead oneTokenNoSpace(ParsePosition pp, String text) { + ParsePosition next = new ParsePosition(pp.getIndex()); + if (next.getIndex() == text.length()) { + return null; + } + int start = next.getIndex(); + if (PATTERN_SYNTAX.contains(text.codePointAt(start))) { + final String result = Character.toString(text.codePointAt(start)); + next.setIndex(start + result.length()); + return new Lookahead(result, pp, next); + } else { + final String result = scan(PATTERN_SYNTAX_OR_WHITE_SPACE, text, next, false); + return result.isEmpty() ? null : new Lookahead(result, pp, next); + } + } + + private Lookahead(String token, ParsePosition pp, ParsePosition next) { + this.token = token; + this.pp = pp; + this.next = next; } - } - private static String nextToken(ParsePosition pp, String text) { - scan(PATTERN_WHITE_SPACE, text, pp, true); - return nextTokenNoSpace(pp, text); + // Advances the ParsePosition passed at construction past the token, and returns the token. 
+ public String consume() { + pp.setIndex(next.getIndex()); + return token; + } + + // If this token is expected, advances the ParsePosition passed at construction past the + // token and returns true. + // Otherwise, this function has no effect and returns false. + public boolean accept(String expected) { + if (expected.equals(token)) { + consume(); + return true; + } else { + return false; + } + } + + private final String token; + private final ParsePosition pp; + private final ParsePosition next; } private static void expectToken(String token, ParsePosition pp, String text) throws ParseException { - ParsePosition next = new ParsePosition(pp.getIndex()); - if (!token.equals(nextToken(next, text))) { + if (!Lookahead.oneToken(pp, text).accept(token)) { throw new ParseException("Expected '" + token + "'", pp.getIndex()); } - pp.setIndex(next.getIndex()); } private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String line) @@ -1035,7 +1068,7 @@ protected String _getVersion() { private static void letLine(ParsePosition pp, String source) throws ParseException { expectToken("$", pp, source); - final String variable = nextTokenNoSpace(pp, source); + final String variable = Lookahead.oneTokenNoSpace(pp, source).consume(); expectToken("=", pp, source); final int valueStart = pp.getIndex(); final UnicodeSet valueSet = parseUnicodeSet(source, pp); @@ -1050,10 +1083,8 @@ private static void letLine(ParsePosition pp, String source) throws ParseExcepti } private static void showLine(String source, ParsePosition pp) throws ParseException { - final var next = new ParsePosition(pp.getIndex()); - if (nextToken(next, source).equals("Each")) { + if (Lookahead.oneToken(pp, source).accept("Each")) { showLister.setMergeRanges(false); - pp.setIndex(next.getIndex()); } showSet(pp, source); showLister.setMergeRanges(doRange);