From 85c2b67624b7d5d72e9815faeb8194fed5b89fcd Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 13:06:01 +0100 Subject: [PATCH 01/15] Failing test --- .../unicode/text/UCD/UnicodeInvariantTest.txt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 9fa0ca3a0..8a53f0eae 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -746,6 +746,32 @@ Let $PostBaseSpacingMarks_Tweak = [\u103B \u1056 \u1057 \u1A57 \u1A6D] Let $PostBaseSpacingMarks_Missed = [] [$PostBaseSpacingMarks_All - $PostBaseSpacingMarks_Tweak - $PostBaseSpacingMarks_Missed] ⊂ [:GCB=XX:] +# Check the consistency of grapheme cluster segmentation (both legacy and +# extended) with canonical equivalence. +# Non-starters are GCB=Extend or GCB=SpacingMark, so that GB9 and GB9a keep +# together any sequences that may be reordered by the Canonical Ordering +# Algorithm. +\P{U15.1.0:ccc=0} ⊆ [\p{U15.1.0:GCB=Extend}\p{U15.1.0:GCB=SpacingMark}] +\P{ccc=0} ⊆ [\p{GCB=Extend}\p{GCB=SpacingMark}] +# Non-starters are actually GCB=Extend, so that GB9 alone does the job, since +# there is no GB9a in legacy grapheme clusters. +# But not before Unicode Version 16.0, oops (see L2/24-009). +\P{U15.1.0:ccc=0} ⊆ \p{U15.1.0:GCB=Extend} +\P{ccc=0} ⊆ \p{GCB=Extend} + +# Characters that appear in non-initial position in the canonical decomposition +# of another character are either Extend, V, or T, so that sequences that are +# equivalent to a canonical composite are kept together by GB6..GB9. +# We only look at the starters, since we dealt with non-starters above. +# Characters that appear in non-initial position in the canonical decomposition +# of a primary composite are NFC_QC=Maybe. We would need to separately check +# the characters that appear in non-initial position in the canonical +# decomposition of a full composition exclusion. +# We would also need to separately check that the characters are T or V only +# appear in canonical decompositions where they follow an LV, LVT, V, or T, or +# an LV or V, respectively. +[\p{NFC_QC=Maybe}&\p{ccc=0}] ⊆ [\p{GCB=Extend}\p{GCB=T}\p{GCB=V}] + ########################## # Emoji ########################## From 6188e191ea638043531fd853f4b71d800ec24ea4 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 13:43:00 +0100 Subject: [PATCH 02/15] an attempt at error messages --- .../text/UCD/TestUnicodeInvariants.java | 72 +++++++++++++------ 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index dbbea74f6..e1c9c3e5b 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -1,5 +1,6 @@ package org.unicode.text.UCD; +import com.google.common.net.UrlEscapers; import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.lang.UCharacter; @@ -207,7 +208,7 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc // ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"), // // ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")}); - while (true) { + for (int lineNumber = 1; ; ++lineNumber) { String line = in.readLine(); if (line == null) { break; @@ -230,24 +231,24 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc } else if (line.startsWith("Let")) { letLine(pp, line); } else if (line.startsWith("In")) { - inLine(pp, line); + inLine(pp, line, lineNumber); } else if (line.startsWith("ShowScript")) { showScript = true; } else if (line.startsWith("HideScript")) { showScript = false; } else if (line.startsWith("Map")) { - testMapLine(line, pp); + testMapLine(line, pp, lineNumber); } else if (line.startsWith("ShowMap")) { showMapLine(line, pp); } else if (line.startsWith("Show")) { showLine(line, pp); } else if (line.startsWith("EquivalencesOf")) { - equivalencesLine(line, pp); + equivalencesLine(line, pp, lineNumber); } else { - testLine(line, pp); + testLine(line, pp, lineNumber); } } catch (final Exception e) { - parseErrorCount = parseError(parseErrorCount, line, e); + parseErrorCount = parseError(parseErrorCount, line, e, lineNumber); continue; } } @@ -276,7 +277,9 @@ static class PropertyComparison { UnicodeProperty property2; } - private static void equivalencesLine(String line, ParsePosition pp) throws ParseException { + private static void equivalencesLine(String line, ParsePosition pp, int lineNumber) + throws ParseException { + // TODO(egg): ::error etc. pp.setIndex("EquivalencesOf".length()); final UnicodeSet domain = new UnicodeSet(line, pp, symbolTable); final var leftProperty = CompoundProperty.of(LATEST_PROPS, line, pp); @@ -457,7 +460,8 @@ private static void equivalencesLine(String line, ParsePosition pp) throws Parse } } - private static void inLine(ParsePosition pp, String line) throws ParseException { + private static void inLine(ParsePosition pp, String line, int lineNumber) + throws ParseException { pp.setIndex(2); final PropertyComparison propertyComparison = getPropertyComparison(pp, line); final UnicodeMap failures = new UnicodeMap<>(); @@ -476,6 +480,7 @@ private static void inLine(ParsePosition pp, String line) throws ParseException if (failureCount != 0) { testFailureCount++; printErrorLine("Test Failure", Side.START, testFailureCount); + // TODO(egg): ::error etc. println( "## Got unexpected " + (propertyComparison.shouldBeEqual ? "differences" : "equalities") @@ -710,7 +715,8 @@ private static void showMapLine(String line, ParsePosition pp) { showLister.setMergeRanges(doRange); } - private static void testLine(String line, ParsePosition pp) throws ParseException { + private static void testLine(String line, ParsePosition pp, int lineNumber) + throws ParseException { if (line.startsWith("Test")) { line = line.substring(4).trim(); } @@ -776,21 +782,24 @@ private static void testLine(String line, ParsePosition pp) throws ParseExceptio "In", rightSide, "But Not In", - leftSide); + leftSide, + lineNumber); checkExpected( rightAndLeft, new UnicodeSet(rightSet).retainAll(leftSet), "In", rightSide, "And In", - leftSide); + leftSide, + lineNumber); checkExpected( left_right, new UnicodeSet(leftSet).removeAll(rightSet), "In", leftSide, "But Not In", - rightSide); + rightSide, + lineNumber); } public static void checkRelation(ParsePosition pp, char relation) throws ParseException { @@ -810,7 +819,8 @@ private static void checkExpected( String rightStatus, String rightSide, String leftStatus, - String leftSide) { + String leftSide, + int lineNumber) { switch (expected) { case empty: if (segment.size() == 0) { @@ -829,9 +839,22 @@ private static void checkExpected( } testFailureCount++; printErrorLine("Test Failure", Side.START, testFailureCount); - println("## Expected " + expected + ", got: " + segment.size() + "\t" + segment.toString()); - println("## " + rightStatus + "\t" + rightSide); - println("## " + leftStatus + "\t" + leftSide); + final var errorMessage = + new String[] { + "Expected " + expected + ", got: " + segment.size() + "\t" + segment.toString(), + rightStatus + "\t" + rightSide, + leftStatus + "\t" + leftSide + }; + for (String line : errorMessage) { + println("## " + line); + } + System.err.println( + "::error file=unicodetools/src/main/resources/org/unicode/text/UCD/" + + DEFAULT_FILE + + ",line=" + + lineNumber + + "title=Invariant test failure::" + + UrlEscapers.urlFragmentEscaper().escape(String.join("\n", errorMessage))); if (doHtml) { out.println(""); } @@ -853,7 +876,8 @@ private static void checkExpected( getProperties(Settings.lastVersion), IndexUnicodeProperties.make(Settings.lastVersion))); - private static void testMapLine(String line, ParsePosition pp) throws ParseException { + private static void testMapLine(String line, ParsePosition pp, int lineNumber) + throws ParseException { char relation = 0; String rightSide = null; String leftSide = null; @@ -915,21 +939,24 @@ private static void testMapLine(String line, ParsePosition pp) throws ParseExcep "In", rightSide, "But Not In", - leftSide); + leftSide, + lineNumber); checkExpected( rightAndLeft, UnicodeMapParser.retainAll(new UnicodeMap().putAll(rightSet), leftSet), "In", rightSide, "And In", - leftSide); + leftSide, + lineNumber); checkExpected( left_right, UnicodeMapParser.removeAll(new UnicodeMap().putAll(leftSet), rightSet), "In", leftSide, "But Not In", - rightSide); + rightSide, + lineNumber); } private static void checkExpected( @@ -938,7 +965,8 @@ private static void checkExpected( String rightStatus, String rightSide, String leftStatus, - String leftSide) { + String leftSide, + int lineNumber) { switch (expected) { case empty: if (segment.size() == 0) { @@ -1015,7 +1043,7 @@ private static void showSet(ParsePosition pp, final String value) { println(); } - private static int parseError(int parseErrorCount, String line, Exception e) { + private static int parseError(int parseErrorCount, String line, Exception e, int lineNumber) { parseErrorCount++; if (e instanceof ParseException) { final int index = ((ParseException) e).getErrorOffset(); From afc7d8ca4a77951dd1ad0eca6cb5c1a338ad1430 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 13:46:35 +0100 Subject: [PATCH 03/15] comma --- .../main/java/org/unicode/text/UCD/TestUnicodeInvariants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index e1c9c3e5b..4c3221472 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -853,7 +853,7 @@ private static void checkExpected( + DEFAULT_FILE + ",line=" + lineNumber - + "title=Invariant test failure::" + + ",title=Invariant test failure::" + UrlEscapers.urlFragmentEscaper().escape(String.join("\n", errorMessage))); if (doHtml) { out.println("
"); From 29b341f6333639c959672f1be80da5cd99505e8e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 14:20:40 +0100 Subject: [PATCH 04/15] table and less escaping --- .../unicode/text/UCD/TestUnicodeInvariants.java | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 4c3221472..128e25f52 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -1,6 +1,5 @@ package org.unicode.text.UCD; -import com.google.common.net.UrlEscapers; import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.lang.UCharacter; @@ -47,6 +46,7 @@ public class TestUnicodeInvariants { private static int showRangeLimit = 20; static boolean doHtml = true; public static final String DEFAULT_FILE = "UnicodeInvariantTest.txt"; + public static final HTMLTabber htmlTabber = new Tabber.HTMLTabber(); private static final int // HELP1 = 0, @@ -172,8 +172,6 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc out3.write('\uFEFF'); // BOM } try (final BufferedReader in = getInputReader(inputFile)) { - final HTMLTabber tabber = new Tabber.HTMLTabber(); - errorLister = new BagFormatter() .setMergeRanges(doRange) @@ -184,7 +182,7 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc .setFixName(toHTML); errorLister.setShowTotal(false); if (doHtml) { - errorLister.setTabber(tabber); + errorLister.setTabber(htmlTabber); } showLister = @@ -199,7 +197,7 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc showLister.setValueSource(LATEST_PROPS.getProperty("script")); } if (doHtml) { - showLister.setTabber(tabber); + showLister.setTabber(htmlTabber); } // symbolTable = new ChainedSymbolTable(); @@ -845,16 +843,23 @@ private static void checkExpected( rightStatus + "\t" + rightSide, leftStatus + "\t" + leftSide }; + var monoTable = new StringWriter(); for (String line : errorMessage) { println("## " + line); } + errorLister.setTabber(new Tabber.MonoTabber()); + errorLister.setLineSeparator("\n"); + errorLister.showSetNames(new PrintWriter(monoTable), segment); System.err.println( "::error file=unicodetools/src/main/resources/org/unicode/text/UCD/" + DEFAULT_FILE + ",line=" + lineNumber + ",title=Invariant test failure::" - + UrlEscapers.urlFragmentEscaper().escape(String.join("\n", errorMessage))); + + (String.join("\n", errorMessage) + "\n" + monoTable.toString()) + .replace("%", "%25") + .replace("\n", "%0A")); + errorLister.setTabber(htmlTabber); if (doHtml) { out.println("
"); } From 0497007f6699a492f003c120aba6e32e6775ae3c Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 14:41:30 +0100 Subject: [PATCH 05/15] Try to get the errors only once --- .github/workflows/cli-build-instructions.yml | 2 +- .../text/UCD/TestUnicodeInvariants.java | 21 +++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.github/workflows/cli-build-instructions.yml b/.github/workflows/cli-build-instructions.yml index 24d4dce1c..729d31bd8 100644 --- a/.github/workflows/cli-build-instructions.yml +++ b/.github/workflows/cli-build-instructions.yml @@ -170,7 +170,7 @@ jobs: - name: Run command - Build and Test run: | cd unicodetools/mine/src - MAVEN_OPTS="-ea" mvn -s .github/workflows/mvn-settings.xml package -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION + MAVEN_OPTS="-ea" mvn -s .github/workflows/mvn-settings.xml package -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION -DEMIT_GITHUB_ERRORS env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 128e25f52..5020ce98c 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -47,6 +47,7 @@ public class TestUnicodeInvariants { static boolean doHtml = true; public static final String DEFAULT_FILE = "UnicodeInvariantTest.txt"; public static final HTMLTabber htmlTabber = new Tabber.HTMLTabber(); + public static final boolean EMIT_GITHUB_ERRORS = System.getProperty("EMIT_GITHUB_ERRORS") != null; private static final int // HELP1 = 0, @@ -850,15 +851,17 @@ private static void checkExpected( errorLister.setTabber(new Tabber.MonoTabber()); errorLister.setLineSeparator("\n"); errorLister.showSetNames(new PrintWriter(monoTable), segment); - System.err.println( - "::error file=unicodetools/src/main/resources/org/unicode/text/UCD/" - + DEFAULT_FILE - + ",line=" - + lineNumber - + ",title=Invariant test failure::" - + (String.join("\n", errorMessage) + "\n" + monoTable.toString()) - .replace("%", "%25") - .replace("\n", "%0A")); + if (EMIT_GITHUB_ERRORS) { + System.err.println( + "::error file=unicodetools/src/main/resources/org/unicode/text/UCD/" + + DEFAULT_FILE + + ",line=" + + lineNumber + + ",title=Invariant test failure::" + + (String.join("\n", errorMessage) + "\n" + monoTable.toString()) + .replace("%", "%25") + .replace("\n", "%0A")); + } errorLister.setTabber(htmlTabber); if (doHtml) { out.println("
"); From 4ca339058c4b1115be18d2031b267fa0ef096444 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 14:57:50 +0100 Subject: [PATCH 06/15] We have screwed up since the beginning of time. --- .../org/unicode/text/UCD/TestUnicodeInvariants.java | 3 ++- .../org/unicode/text/UCD/UnicodeInvariantTest.txt | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 5020ce98c..6b95372ad 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -47,7 +47,8 @@ public class TestUnicodeInvariants { static boolean doHtml = true; public static final String DEFAULT_FILE = "UnicodeInvariantTest.txt"; public static final HTMLTabber htmlTabber = new Tabber.HTMLTabber(); - public static final boolean EMIT_GITHUB_ERRORS = System.getProperty("EMIT_GITHUB_ERRORS") != null; + public static final boolean EMIT_GITHUB_ERRORS = + System.getProperty("EMIT_GITHUB_ERRORS") != null; private static final int // HELP1 = 0, diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 8a53f0eae..b8f79fbe1 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -750,12 +750,17 @@ Let $PostBaseSpacingMarks_Missed = [] # extended) with canonical equivalence. # Non-starters are GCB=Extend or GCB=SpacingMark, so that GB9 and GB9a keep # together any sequences that may be reordered by the Canonical Ordering -# Algorithm. -\P{U15.1.0:ccc=0} ⊆ [\p{U15.1.0:GCB=Extend}\p{U15.1.0:GCB=SpacingMark}] +# Algorithm. This has been true ever since Extended Grapheme Clusters were +# added. +\P{U5.1.0:ccc=0} ⊆ [\p{U5.1.0:GCB=Extend}\p{U5.1.0:GCB=SpacingMark}] \P{ccc=0} ⊆ [\p{GCB=Extend}\p{GCB=SpacingMark}] # Non-starters are actually GCB=Extend, so that GB9 alone does the job, since # there is no GB9a in legacy grapheme clusters. -# But not before Unicode Version 16.0, oops (see L2/24-009). +# But not before Unicode Version 16.0, even though we were saying so since +# Unicode Version 4.0 (https://www.unicode.org/reports/tr29/tr29-4.html#Implementation_Notes), +# oops (see L2/24-009). +\P{U4.0.0:ccc=0} ⊆ \p{U4.0.0:Grapheme_Extend} +\P{U4.1.0:ccc=0} ⊆ \p{U4.1.0:GCB=Extend} \P{U15.1.0:ccc=0} ⊆ \p{U15.1.0:GCB=Extend} \P{ccc=0} ⊆ \p{GCB=Extend} From 80acf018b13ab0126d10b11822af3aa0aa174a01 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 16:13:07 +0100 Subject: [PATCH 07/15] Revert invariant tests --- .../unicode/text/UCD/UnicodeInvariantTest.txt | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index b8f79fbe1..9fa0ca3a0 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -746,37 +746,6 @@ Let $PostBaseSpacingMarks_Tweak = [\u103B \u1056 \u1057 \u1A57 \u1A6D] Let $PostBaseSpacingMarks_Missed = [] [$PostBaseSpacingMarks_All - $PostBaseSpacingMarks_Tweak - $PostBaseSpacingMarks_Missed] ⊂ [:GCB=XX:] -# Check the consistency of grapheme cluster segmentation (both legacy and -# extended) with canonical equivalence. -# Non-starters are GCB=Extend or GCB=SpacingMark, so that GB9 and GB9a keep -# together any sequences that may be reordered by the Canonical Ordering -# Algorithm. This has been true ever since Extended Grapheme Clusters were -# added. -\P{U5.1.0:ccc=0} ⊆ [\p{U5.1.0:GCB=Extend}\p{U5.1.0:GCB=SpacingMark}] -\P{ccc=0} ⊆ [\p{GCB=Extend}\p{GCB=SpacingMark}] -# Non-starters are actually GCB=Extend, so that GB9 alone does the job, since -# there is no GB9a in legacy grapheme clusters. -# But not before Unicode Version 16.0, even though we were saying so since -# Unicode Version 4.0 (https://www.unicode.org/reports/tr29/tr29-4.html#Implementation_Notes), -# oops (see L2/24-009). -\P{U4.0.0:ccc=0} ⊆ \p{U4.0.0:Grapheme_Extend} -\P{U4.1.0:ccc=0} ⊆ \p{U4.1.0:GCB=Extend} -\P{U15.1.0:ccc=0} ⊆ \p{U15.1.0:GCB=Extend} -\P{ccc=0} ⊆ \p{GCB=Extend} - -# Characters that appear in non-initial position in the canonical decomposition -# of another character are either Extend, V, or T, so that sequences that are -# equivalent to a canonical composite are kept together by GB6..GB9. -# We only look at the starters, since we dealt with non-starters above. -# Characters that appear in non-initial position in the canonical decomposition -# of a primary composite are NFC_QC=Maybe. We would need to separately check -# the characters that appear in non-initial position in the canonical -# decomposition of a full composition exclusion. -# We would also need to separately check that the characters are T or V only -# appear in canonical decompositions where they follow an LV, LVT, V, or T, or -# an LV or V, respectively. -[\p{NFC_QC=Maybe}&\p{ccc=0}] ⊆ [\p{GCB=Extend}\p{GCB=T}\p{GCB=V}] - ########################## # Emoji ########################## From 93c95708fc924aa7a188b28176148672801e9331 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 16:38:41 +0100 Subject: [PATCH 08/15] Report various kinds of errors --- .../text/UCD/TestUnicodeInvariants.java | 62 +++++++++++++------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 6b95372ad..3ee63e6e8 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -279,7 +279,6 @@ static class PropertyComparison { private static void equivalencesLine(String line, ParsePosition pp, int lineNumber) throws ParseException { - // TODO(egg): ::error etc. pp.setIndex("EquivalencesOf".length()); final UnicodeSet domain = new UnicodeSet(line, pp, symbolTable); final var leftProperty = CompoundProperty.of(LATEST_PROPS, line, pp); @@ -438,16 +437,24 @@ private static void equivalencesLine(String line, ParsePosition pp, int lineNumb ++testFailureCount; printErrorLine("Test Failure", Side.START, testFailureCount); } + final List errorMessage = new ArrayList<>(); if (counterexamples.isEmpty()) { - println("There are no counterexamples to " + relationOperator + "."); + errorMessage.add("There are no counterexamples to " + relationOperator + "."); } else { if (leftShouldImplyRight) { - println("The implication ⇒ is " + leftImpliesRightCounterexamples.isEmpty() + "."); + errorMessage.add( + "The implication ⇒ is " + leftImpliesRightCounterexamples.isEmpty() + "."); } if (rightShouldImplyLeft) { - println("The implication ⇐ is " + rightImpliesLeftCounterexamples.isEmpty() + "."); + errorMessage.add( + "The implication ⇐ is " + rightImpliesLeftCounterexamples.isEmpty() + "."); } } + for (var errorLine : errorMessage) { + println(errorLine); + } + errorMessage.addAll(counterexamples); + reportTestFailure(lineNumber, String.join("\n", errorMessage)); out.println(failure ? "
" : "
"); for (String counterexample : counterexamples) { out.println("
"); @@ -481,13 +488,23 @@ private static void inLine(ParsePosition pp, String line, int lineNumber) testFailureCount++; printErrorLine("Test Failure", Side.START, testFailureCount); // TODO(egg): ::error etc. - println( - "## Got unexpected " + String errorMessage = + "Got unexpected " + (propertyComparison.shouldBeEqual ? "differences" : "equalities") + ": " - + failureCount); + + failureCount; + println("## " + errorMessage); + final UnicodeLabel failureProp = new UnicodeProperty.UnicodeMapProperty().set(failures); errorLister.setValueSource(failureProp); + + var monoTable = new StringWriter(); + errorLister.setTabber(new Tabber.MonoTabber()); + errorLister.setLineSeparator("\n"); + errorLister.showSetNames(new PrintWriter(monoTable), failureSet); + errorLister.setTabber(htmlTabber); + reportTestFailure(lineNumber, errorMessage + "\n" + monoTable.toString()); + if (doHtml) { out.println(""); } @@ -852,17 +869,8 @@ private static void checkExpected( errorLister.setTabber(new Tabber.MonoTabber()); errorLister.setLineSeparator("\n"); errorLister.showSetNames(new PrintWriter(monoTable), segment); - if (EMIT_GITHUB_ERRORS) { - System.err.println( - "::error file=unicodetools/src/main/resources/org/unicode/text/UCD/" - + DEFAULT_FILE - + ",line=" - + lineNumber - + ",title=Invariant test failure::" - + (String.join("\n", errorMessage) + "\n" + monoTable.toString()) - .replace("%", "%25") - .replace("\n", "%0A")); - } + reportTestFailure( + lineNumber, String.join("\n", errorMessage) + "\n" + monoTable.toString()); errorLister.setTabber(htmlTabber); if (doHtml) { out.println("
"); @@ -1159,6 +1167,24 @@ private static void println() { println(""); } + private static void reportTestFailure(int lineNumber, String message) { + reportError(lineNumber, "Invariant test failure", message); + } + + private static void reportError(int lineNumber, String title, String message) { + if (EMIT_GITHUB_ERRORS) { + System.err.println( + "::error file=unicodetools/src/main/resources/org/unicode/text/UCD/" + + DEFAULT_FILE + + ",line=" + + lineNumber + + ",title=" + + title + + "::" + + message.replace("%", "%25").replace("\n", "%0A")); + } + } + /** Should add to UnicodeSet */ public static String scan(UnicodeSet unicodeSet, String line, ParsePosition pp, boolean in) { final int start = pp.getIndex(); From 281f70bcc119c297e1aa9cbdfb5449ecea5131c3 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 16:56:03 +0100 Subject: [PATCH 09/15] report parse errors --- .../java/org/unicode/text/UCD/TestUnicodeInvariants.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 3ee63e6e8..74beb4a0c 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -487,7 +487,6 @@ private static void inLine(ParsePosition pp, String line, int lineNumber) if (failureCount != 0) { testFailureCount++; printErrorLine("Test Failure", Side.START, testFailureCount); - // TODO(egg): ::error etc. String errorMessage = "Got unexpected " + (propertyComparison.shouldBeEqual ? "differences" : "equalities") @@ -1075,6 +1074,10 @@ private static int parseError(int parseErrorCount, String line, Exception e, int println("##" + message); } e.printStackTrace(out); + StringWriter w = new StringWriter().append(message).append('\n'); + e.printStackTrace(new PrintWriter(w)); + reportParseError(lineNumber, w.toString()); + out.println(""); printErrorLine("Parse Error", Side.END, parseErrorCount); println(); @@ -1167,6 +1170,10 @@ private static void println() { println(""); } + private static void reportParseError(int lineNumber, String message) { + reportError(lineNumber, "Parse error", message); + } + private static void reportTestFailure(int lineNumber, String message) { reportError(lineNumber, "Invariant test failure", message); } From 70a4ee2bdb98401b4b35c6d7c882911455199d3c Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 16:56:28 +0100 Subject: [PATCH 10/15] Break everything --- unicodetools/data/ucd/dev/CaseFolding.txt | 3 --- unicodetools/data/ucd/dev/DerivedCoreProperties.txt | 6 +++--- unicodetools/data/ucd/dev/PropList.txt | 6 +++--- unicodetools/data/ucd/dev/UnicodeData.txt | 2 +- unicodetools/data/ucd/dev/extracted/DerivedName.txt | 4 ++-- .../main/java/org/unicode/text/UCD/GenerateCaseFolding.java | 6 +++--- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 +- 7 files changed, 13 insertions(+), 16 deletions(-) diff --git a/unicodetools/data/ucd/dev/CaseFolding.txt b/unicodetools/data/ucd/dev/CaseFolding.txt index 2f1047b36..d0c2c69af 100644 --- a/unicodetools/data/ucd/dev/CaseFolding.txt +++ b/unicodetools/data/ucd/dev/CaseFolding.txt @@ -930,7 +930,6 @@ 1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA 1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA -1FD3; S; 0390; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI 1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI 1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY @@ -939,7 +938,6 @@ 1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA 1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA 1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA -1FE3; S; 03B0; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI 1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI 1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI @@ -1335,7 +1333,6 @@ FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T -FB05; S; FB06; # LATIN SMALL LIGATURE LONG S T FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH diff --git a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt index 118350283..ec2af4ada 100644 --- a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt +++ b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt @@ -1,5 +1,5 @@ # DerivedCoreProperties-16.0.0.txt -# Date: 2023-11-10, 22:25:26 GMT +# Date: 2024-01-10, 15:40:42 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -726,7 +726,7 @@ FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A 1D78 ; Alphabetic # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; Alphabetic # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; Alphabetic # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA -1DD3..1DF4 ; Alphabetic # Mn [34] COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE..COMBINING LATIN SMALL LETTER U WITH DIAERESIS +1DE7..1DF4 ; Alphabetic # Mn [14] COMBINING LATIN SMALL LETTER ALPHA..COMBINING LATIN SMALL LETTER U WITH DIAERESIS 1E00..1F15 ; Alphabetic # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; Alphabetic # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; Alphabetic # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA @@ -1440,7 +1440,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 30000..3134A ; Alphabetic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Alphabetic # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 138766 +# Total code points: 138746 # ================================================ diff --git a/unicodetools/data/ucd/dev/PropList.txt b/unicodetools/data/ucd/dev/PropList.txt index 6225e4c33..44992a259 100644 --- a/unicodetools/data/ucd/dev/PropList.txt +++ b/unicodetools/data/ucd/dev/PropList.txt @@ -1,5 +1,5 @@ # PropList-16.0.0.txt -# Date: 2023-11-10, 22:06:29 GMT +# Date: 2024-01-10, 15:41:07 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -637,7 +637,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L 1C2C..1C33 ; Other_Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C34..1C35 ; Other_Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1C36 ; Other_Alphabetic # Mn LEPCHA SIGN RAN -1DD3..1DF4 ; Other_Alphabetic # Mn [34] COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE..COMBINING LATIN SMALL LETTER U WITH DIAERESIS +1DE7..1DF4 ; Other_Alphabetic # Mn [14] COMBINING LATIN SMALL LETTER ALPHA..COMBINING LATIN SMALL LETTER U WITH DIAERESIS 24B6..24E9 ; Other_Alphabetic # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z 2DE0..2DFF ; Other_Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS A674..A67B ; Other_Alphabetic # Mn [8] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC LETTER OMEGA @@ -850,7 +850,7 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA 1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 1495 +# Total code points: 1475 # ================================================ diff --git a/unicodetools/data/ucd/dev/UnicodeData.txt b/unicodetools/data/ucd/dev/UnicodeData.txt index 4376ff723..865882b95 100644 --- a/unicodetools/data/ucd/dev/UnicodeData.txt +++ b/unicodetools/data/ucd/dev/UnicodeData.txt @@ -23591,7 +23591,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 130DD;EGYPTIAN HIEROGLYPH E010;Lo;0;L;;;;;N;;;;; 130DE;EGYPTIAN HIEROGLYPH E011;Lo;0;L;;;;;N;;;;; 130DF;EGYPTIAN HIEROGLYPH E012;Lo;0;L;;;;;N;;;;; -130E0;EGYPTIAN HIEROGLYPH E013;Lo;0;L;;;;;N;;;;; +130E0;MEOW;Lo;0;L;;;;;N;;;;; 130E1;EGYPTIAN HIEROGLYPH E014;Lo;0;L;;;;;N;;;;; 130E2;EGYPTIAN HIEROGLYPH E015;Lo;0;L;;;;;N;;;;; 130E3;EGYPTIAN HIEROGLYPH E016;Lo;0;L;;;;;N;;;;; diff --git a/unicodetools/data/ucd/dev/extracted/DerivedName.txt b/unicodetools/data/ucd/dev/extracted/DerivedName.txt index 145e66ed9..31b320901 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedName.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedName.txt @@ -1,5 +1,5 @@ # DerivedName-16.0.0.txt -# Date: 2023-11-10, 22:25:34 GMT +# Date: 2024-01-10, 15:40:50 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -34256,7 +34256,7 @@ FFFD ; REPLACEMENT CHARACTER 130DD ; EGYPTIAN HIEROGLYPH E010 130DE ; EGYPTIAN HIEROGLYPH E011 130DF ; EGYPTIAN HIEROGLYPH E012 -130E0 ; EGYPTIAN HIEROGLYPH E013 +130E0 ; MEOW 130E1 ; EGYPTIAN HIEROGLYPH E014 130E2 ; EGYPTIAN HIEROGLYPH E015 130E3 ; EGYPTIAN HIEROGLYPH E016 diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java index 93e05b25d..a63659aff 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java @@ -245,14 +245,14 @@ static void drawLine( // ΐ → ΐ // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA → // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - 0x1FD3, 0x0390, + // 0x1FD3, 0x0390, // ΰ → ΰ // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA → // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - 0x1FE3, 0x03B0, + // 0x1FE3, 0x03B0, // ſt → st // LATIN SMALL LIGATURE LONG S T → LATIN SMALL LIGATURE ST - 0xFB05, 0xFB06 + // 0xFB05, 0xFB06 }; private static Map getCaseFolding( diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 9fa0ca3a0..dcb2601a0 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -286,7 +286,7 @@ Let $gcMn_bcL = [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F] # Stability: The Case_Folding property value is limited so that no string when case folded expands to more than 3× in length (measured in code units). \p{cf=/..../} = [] # Note: This bound is tight: -\p{cf=/.../} ⊃ [] +\p{cf=/.../} ⊋ [] # Case folding is not the same as lowercasing: Cherokee case folds to uppercase. In \p{sc=Cher} cf = uc From 03b74f241ad1b7978773ca4d357c6a09c48ccd39 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 17:07:48 +0100 Subject: [PATCH 11/15] make it a bit more readable hopefully --- .../java/org/unicode/text/UCD/TestUnicodeInvariants.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 74beb4a0c..bfd2a212f 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -454,7 +454,7 @@ private static void equivalencesLine(String line, ParsePosition pp, int lineNumb println(errorLine); } errorMessage.addAll(counterexamples); - reportTestFailure(lineNumber, String.join("\n", errorMessage)); + reportTestFailure(lineNumber, String.join("\n", errorMessage).replace('\t', ' ')); out.println(failure ? "
" : "
"); for (String counterexample : counterexamples) { out.println("
"); @@ -1073,10 +1073,8 @@ private static int parseError(int parseErrorCount, String line, Exception e, int if (message != null) { println("##" + message); } + reportParseError(lineNumber, message); e.printStackTrace(out); - StringWriter w = new StringWriter().append(message).append('\n'); - e.printStackTrace(new PrintWriter(w)); - reportParseError(lineNumber, w.toString()); out.println(""); printErrorLine("Parse Error", Side.END, parseErrorCount); From 4ac8e8451b6b2757b62d8f046faee4b90eb974b2 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 17:33:18 +0100 Subject: [PATCH 12/15] Revert "Break everything" This reverts commit 70a4ee2bdb98401b4b35c6d7c882911455199d3c. --- unicodetools/data/ucd/dev/CaseFolding.txt | 3 +++ unicodetools/data/ucd/dev/DerivedCoreProperties.txt | 6 +++--- unicodetools/data/ucd/dev/PropList.txt | 6 +++--- unicodetools/data/ucd/dev/UnicodeData.txt | 2 +- unicodetools/data/ucd/dev/extracted/DerivedName.txt | 4 ++-- .../main/java/org/unicode/text/UCD/GenerateCaseFolding.java | 6 +++--- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 +- 7 files changed, 16 insertions(+), 13 deletions(-) diff --git a/unicodetools/data/ucd/dev/CaseFolding.txt b/unicodetools/data/ucd/dev/CaseFolding.txt index d0c2c69af..2f1047b36 100644 --- a/unicodetools/data/ucd/dev/CaseFolding.txt +++ b/unicodetools/data/ucd/dev/CaseFolding.txt @@ -930,6 +930,7 @@ 1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA 1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD3; S; 0390; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI 1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI 1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY @@ -938,6 +939,7 @@ 1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA 1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA 1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE3; S; 03B0; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI 1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI 1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI @@ -1333,6 +1335,7 @@ FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T +FB05; S; FB06; # LATIN SMALL LIGATURE LONG S T FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH diff --git a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt index ec2af4ada..118350283 100644 --- a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt +++ b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt @@ -1,5 +1,5 @@ # DerivedCoreProperties-16.0.0.txt -# Date: 2024-01-10, 15:40:42 GMT +# Date: 2023-11-10, 22:25:26 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -726,7 +726,7 @@ FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A 1D78 ; Alphabetic # Lm MODIFIER LETTER CYRILLIC EN 1D79..1D9A ; Alphabetic # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK 1D9B..1DBF ; Alphabetic # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA -1DE7..1DF4 ; Alphabetic # Mn [14] COMBINING LATIN SMALL LETTER ALPHA..COMBINING LATIN SMALL LETTER U WITH DIAERESIS +1DD3..1DF4 ; Alphabetic # Mn [34] COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE..COMBINING LATIN SMALL LETTER U WITH DIAERESIS 1E00..1F15 ; Alphabetic # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA 1F18..1F1D ; Alphabetic # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA 1F20..1F45 ; Alphabetic # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA @@ -1440,7 +1440,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 30000..3134A ; Alphabetic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Alphabetic # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 138746 +# Total code points: 138766 # ================================================ diff --git a/unicodetools/data/ucd/dev/PropList.txt b/unicodetools/data/ucd/dev/PropList.txt index 44992a259..6225e4c33 100644 --- a/unicodetools/data/ucd/dev/PropList.txt +++ b/unicodetools/data/ucd/dev/PropList.txt @@ -1,5 +1,5 @@ # PropList-16.0.0.txt -# Date: 2024-01-10, 15:41:07 GMT +# Date: 2023-11-10, 22:06:29 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -637,7 +637,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L 1C2C..1C33 ; Other_Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C34..1C35 ; Other_Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1C36 ; Other_Alphabetic # Mn LEPCHA SIGN RAN -1DE7..1DF4 ; Other_Alphabetic # Mn [14] COMBINING LATIN SMALL LETTER ALPHA..COMBINING LATIN SMALL LETTER U WITH DIAERESIS +1DD3..1DF4 ; Other_Alphabetic # Mn [34] COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE..COMBINING LATIN SMALL LETTER U WITH DIAERESIS 24B6..24E9 ; Other_Alphabetic # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z 2DE0..2DFF ; Other_Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS A674..A67B ; Other_Alphabetic # Mn [8] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC LETTER OMEGA @@ -850,7 +850,7 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA 1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 1475 +# Total code points: 1495 # ================================================ diff --git a/unicodetools/data/ucd/dev/UnicodeData.txt b/unicodetools/data/ucd/dev/UnicodeData.txt index 865882b95..4376ff723 100644 --- a/unicodetools/data/ucd/dev/UnicodeData.txt +++ b/unicodetools/data/ucd/dev/UnicodeData.txt @@ -23591,7 +23591,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 130DD;EGYPTIAN HIEROGLYPH E010;Lo;0;L;;;;;N;;;;; 130DE;EGYPTIAN HIEROGLYPH E011;Lo;0;L;;;;;N;;;;; 130DF;EGYPTIAN HIEROGLYPH E012;Lo;0;L;;;;;N;;;;; -130E0;MEOW;Lo;0;L;;;;;N;;;;; +130E0;EGYPTIAN HIEROGLYPH E013;Lo;0;L;;;;;N;;;;; 130E1;EGYPTIAN HIEROGLYPH E014;Lo;0;L;;;;;N;;;;; 130E2;EGYPTIAN HIEROGLYPH E015;Lo;0;L;;;;;N;;;;; 130E3;EGYPTIAN HIEROGLYPH E016;Lo;0;L;;;;;N;;;;; diff --git a/unicodetools/data/ucd/dev/extracted/DerivedName.txt b/unicodetools/data/ucd/dev/extracted/DerivedName.txt index 31b320901..145e66ed9 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedName.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedName.txt @@ -1,5 +1,5 @@ # DerivedName-16.0.0.txt -# Date: 2024-01-10, 15:40:50 GMT +# Date: 2023-11-10, 22:25:34 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -34256,7 +34256,7 @@ FFFD ; REPLACEMENT CHARACTER 130DD ; EGYPTIAN HIEROGLYPH E010 130DE ; EGYPTIAN HIEROGLYPH E011 130DF ; EGYPTIAN HIEROGLYPH E012 -130E0 ; MEOW +130E0 ; EGYPTIAN HIEROGLYPH E013 130E1 ; EGYPTIAN HIEROGLYPH E014 130E2 ; EGYPTIAN HIEROGLYPH E015 130E3 ; EGYPTIAN HIEROGLYPH E016 diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java index a63659aff..93e05b25d 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java @@ -245,14 +245,14 @@ static void drawLine( // ΐ → ΐ // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA → // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - // 0x1FD3, 0x0390, + 0x1FD3, 0x0390, // ΰ → ΰ // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA → // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - // 0x1FE3, 0x03B0, + 0x1FE3, 0x03B0, // ſt → st // LATIN SMALL LIGATURE LONG S T → LATIN SMALL LIGATURE ST - // 0xFB05, 0xFB06 + 0xFB05, 0xFB06 }; private static Map getCaseFolding( diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index dcb2601a0..9fa0ca3a0 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -286,7 +286,7 @@ Let $gcMn_bcL = [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F] # Stability: The Case_Folding property value is limited so that no string when case folded expands to more than 3× in length (measured in code units). \p{cf=/..../} = [] # Note: This bound is tight: -\p{cf=/.../} ⊋ [] +\p{cf=/.../} ⊃ [] # Case folding is not the same as lowercasing: Cherokee case folds to uppercase. In \p{sc=Cher} cf = uc From 3ba16f5e45f911741e1464c843e88b11283eefd5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 17:52:05 +0100 Subject: [PATCH 13/15] It is only an error if it is not what we expect. --- .../java/org/unicode/text/UCD/TestUnicodeInvariants.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index bfd2a212f..4093bc9ec 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -450,8 +450,10 @@ private static void equivalencesLine(String line, ParsePosition pp, int lineNumb "The implication ⇐ is " + rightImpliesLeftCounterexamples.isEmpty() + "."); } } - for (var errorLine : errorMessage) { - println(errorLine); + if (failure) { + for (var errorLine : errorMessage) { + println(errorLine); + } } errorMessage.addAll(counterexamples); reportTestFailure(lineNumber, String.join("\n", errorMessage).replace('\t', ' ')); From 008fa450f17e0ed4582081f41e729586ea116042 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 10 Jan 2024 18:18:56 +0100 Subject: [PATCH 14/15] Put the condition in the right place --- .../org/unicode/text/UCD/TestUnicodeInvariants.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 4093bc9ec..90d9d2b07 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -450,13 +450,13 @@ private static void equivalencesLine(String line, ParsePosition pp, int lineNumb "The implication ⇐ is " + rightImpliesLeftCounterexamples.isEmpty() + "."); } } - if (failure) { - for (var errorLine : errorMessage) { - println(errorLine); - } + for (var errorLine : errorMessage) { + println(errorLine); } errorMessage.addAll(counterexamples); - reportTestFailure(lineNumber, String.join("\n", errorMessage).replace('\t', ' ')); + if (failure) { + reportTestFailure(lineNumber, String.join("\n", errorMessage).replace('\t', ' ')); + } out.println(failure ? "" : "
"); for (String counterexample : counterexamples) { out.println("
"); From 47ae9c4f23134a36dd9f288b7421533f5fa15808 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 12 Jan 2024 23:09:49 +0100 Subject: [PATCH 15/15] Fehlermeldungszeilen --- .../text/UCD/TestUnicodeInvariants.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 90d9d2b07..9327e02ff 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -437,25 +437,25 @@ private static void equivalencesLine(String line, ParsePosition pp, int lineNumb ++testFailureCount; printErrorLine("Test Failure", Side.START, testFailureCount); } - final List errorMessage = new ArrayList<>(); + final List errorMessageLines = new ArrayList<>(); if (counterexamples.isEmpty()) { - errorMessage.add("There are no counterexamples to " + relationOperator + "."); + errorMessageLines.add("There are no counterexamples to " + relationOperator + "."); } else { if (leftShouldImplyRight) { - errorMessage.add( + errorMessageLines.add( "The implication ⇒ is " + leftImpliesRightCounterexamples.isEmpty() + "."); } if (rightShouldImplyLeft) { - errorMessage.add( + errorMessageLines.add( "The implication ⇐ is " + rightImpliesLeftCounterexamples.isEmpty() + "."); } } - for (var errorLine : errorMessage) { + for (var errorLine : errorMessageLines) { println(errorLine); } - errorMessage.addAll(counterexamples); + errorMessageLines.addAll(counterexamples); if (failure) { - reportTestFailure(lineNumber, String.join("\n", errorMessage).replace('\t', ' ')); + reportTestFailure(lineNumber, String.join("\n", errorMessageLines).replace('\t', ' ')); } out.println(failure ? "" : "
"); for (String counterexample : counterexamples) { @@ -857,21 +857,21 @@ private static void checkExpected( } testFailureCount++; printErrorLine("Test Failure", Side.START, testFailureCount); - final var errorMessage = + final var errorMessageLines = new String[] { "Expected " + expected + ", got: " + segment.size() + "\t" + segment.toString(), rightStatus + "\t" + rightSide, leftStatus + "\t" + leftSide }; var monoTable = new StringWriter(); - for (String line : errorMessage) { + for (String line : errorMessageLines) { println("## " + line); } errorLister.setTabber(new Tabber.MonoTabber()); errorLister.setLineSeparator("\n"); errorLister.showSetNames(new PrintWriter(monoTable), segment); reportTestFailure( - lineNumber, String.join("\n", errorMessage) + "\n" + monoTable.toString()); + lineNumber, String.join("\n", errorMessageLines) + "\n" + monoTable.toString()); errorLister.setTabber(htmlTabber); if (doHtml) { out.println("
");