diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index e0f0cf172..ad1479668 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -471,6 +471,9 @@ public synchronized UnicodeMap load(UcdProperty prop2, boolean expectCac final String fileName = fileInfo.getFileName(ucdVersion); if (FILE_CACHE) { + // TODO(egg): When using cached property data, most defaults do not get + // loaded in PropertyParsingInfo, as that happens in parseSourceFile. + // Only the ones from the Extra files are loaded. data0 = getCachedMap(prop2, fullFilename); if (data0 != null) { property2UnicodeMap.put(prop2, data0.freeze()); diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 6bfdc0edf..320b7d120 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -17,9 +17,11 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.TreeMap; import java.util.function.Function; import java.util.regex.Pattern; @@ -31,6 +33,8 @@ import org.unicode.jsp.ICUPropertyFactory; import org.unicode.props.BagFormatter; import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.IndexUnicodeProperties.DefaultValueType; +import org.unicode.props.UcdProperty; import org.unicode.props.UnicodeProperty; import org.unicode.props.UnicodeProperty.Factory; import org.unicode.text.utility.Settings; @@ -234,6 +238,8 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang 
letLine(pp, line); } else if (line.startsWith("In")) { inLine(pp, line, inputFile, lineNumber); + } else if (line.startsWith("Propertywise")) { + propertywiseLine(pp, line, inputFile, lineNumber); } else if (line.startsWith("ShowScript")) { showScript = true; } else if (line.startsWith("HideScript")) { @@ -326,6 +332,84 @@ protected String getFailure(int codepoint) { } } + private static void propertywiseLine(ParsePosition pp, String line, String file, int lineNumber) + throws ParseException { + pp.setIndex("Propertywise".length()); + final UnicodeSet set = new UnicodeSet(line, pp, symbolTable); + if (set.hasStrings()) { + throw new ParseException( + "Set should contain only single code points for property comparison", + pp.getIndex()); + } + expectToken("AreAlike", pp, line); + if (pp.getIndex() < line.length()) { + expectToken(",", pp, line); + expectToken("Except", pp, line); + expectToken(":", pp, line); + } + Set excludedProperties = new HashSet<>(); + excludedProperties.add("Name"); + while (pp.getIndex() < line.length()) { + final int propertyNameStart = pp.getIndex(); + scan(PATTERN_WHITE_SPACE, line, pp, false); + excludedProperties.add(line.substring(propertyNameStart, pp.getIndex())); + scan(PATTERN_WHITE_SPACE, line, pp, true); + } + final var iup = IndexUnicodeProperties.make(Settings.latestVersion); + final List errorMessageLines = new ArrayList<>(); + for (var p : UcdProperty.values()) { + final var property = iup.getProperty(p); + if (property.getNameAliases().stream() + .anyMatch(alias -> excludedProperties.contains(alias))) { + continue; + } + final int first = set.charAt(0); + String p1 = property.getValue(first); + for (var range : set.ranges()) { + for (int c = range.codepoint; c <= range.codepointEnd; ++c) { + if (c == first) { + continue; + } + String p2 = property.getValue(c); + if (!Objects.equals(p1, p2)) { + if (IndexUnicodeProperties.getResolvedDefaultValueType(p) + != DefaultValueType.CODE_POINT + || 
!p1.equals(Character.toString(first)) + || !p2.equals(Character.toString(c))) { + errorMessageLines.add( + property.getName() + + "(" + + Character.toString(first) + + ")\t=\t" + + p1 + + "\t≠\t" + + p2 + + "\t=\t" + + property.getName() + + "(" + + Character.toString(c) + + ")"); + } + } + } + } + } + if (!errorMessageLines.isEmpty()) { + testFailureCount++; + printErrorLine("Test Failure", Side.START, testFailureCount); + reportTestFailure( + file, lineNumber, String.join("\n", errorMessageLines).replace('\t', ' ')); + out.println(""); + for (String errorMessageLine : errorMessageLines) { + out.println(""); + } + out.println("
"); + out.println(toHTML.transform(errorMessageLine).replace("\t", "")); + out.println("
"); + printErrorLine("Test Failure", Side.END, testFailureCount); + } + } + private static void equivalencesLine(String line, ParsePosition pp, String file, int lineNumber) throws ParseException { pp.setIndex("OnPairsOf".length()); diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt index 465c613c6..98613a31c 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt @@ -101,9 +101,15 @@ # Overrides for bugs # TODO(egg): These are specified in their respective files, we should not need them here. + # @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; # @missing: 0000..10FFFF; Equivalent_Unified_Ideograph; +# At least the following two appear to be needed because of issues related to caching; +# See comments in IndexUnicodeProperties.java. +# @missing: 0000..10FFFF; NFKC_Casefold; +# @missing: 0000..10FFFF; NFKC_SCF; + # Extras # @missing: 0000..10FFFF; Idn_Status ; disallowed diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt new file mode 100644 index 000000000..57b2f5e96 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt @@ -0,0 +1,19 @@ +# This file uses the invariant test language, but contains comparisons between +# new and pre-existing characters to aid in PAG review of encoding proposals. + +## Unicode 16.0 additions. + +# These comparisons were not in place when properties were initially assigned for the 16.0 +# répertoire. +# We note here the feedback about errors that would have been caught by them. + +# U+18CFF is a blank character for the Khitan Small Script; aside from looking blank, +# it is indistinguishable from other Khitan Small Script characters. 
See L2/23-065. +# In particular, it is ideographic: https://www.unicode.org/review/pri497/feedback.html#ID20240216140104. +Propertywise [\N{KHITAN SMALL SCRIPT CHARACTER-18CFF} \N{KHITAN SMALL SCRIPT CHARACTER-18B00}] AreAlike, Except: Age + +# HXG (briefly known as HZXG) and SZP are just like all the other CJK strokes. +# In particular, they are scx=Hani: https://www.unicode.org/review/pri502/feedback.html#ID20240523095709. +Propertywise [\N{CJK STROKE T} \N{CJK STROKE HXG}\N{CJK STROKE SZP}] AreAlike, Except: Age + +## Provisionally assigned. [placeholder for draft PRs] \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index d9cbb53a4..3eb759238 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -112,6 +112,20 @@ # OnPairsOf $strings, EqualityOf Case_Folding ⇏ EqualityOf Simple_Case_Folding # OnPairsOf $strings, EqualityOf Case_Folding ⇐ EqualityOf Simple_Case_Folding ########################## +# Propertywise <unicodeSet> AreAlike [, Except: <property names>] +# +# Checks that all property assignments of the code points in <unicodeSet> are the same, +# except for the Name property and any properties listed in the space-separated +# Except clause. +# For the purposes of this check, if all characters in <unicodeSet> are mapped to themselves +# by some property with default value <code point>, these assignments are the same. +# +# Example: Propertywise [𐛪 𐛫] AreAlike +# These two Linear A signs (A751 and A752) behave identically. +# Example: Propertywise [ي ۑ] AreAlike, Except: Confusable_MA Unicode_1_Name +# This checks that yeh (with two dots) and yeh with three dots behave the same, +# except for confusability and their name in Unicode 1 (both have one, so it is different). 
+########################## # There is new syntax for testing UnicodeMaps # # Map @@ -1091,3 +1105,7 @@ In [\p{Block=Hangul Syllables} - \p{gc=Cn}], (prepend HANGUL SYLLABLE ) * (strin # https://www.unicode.org/review/pri497/feedback.html#ID20240216135149. In \p{Decomposition_Type=font}, Bidi_Class = Bidi_Class * Decomposition_Mapping +# Basic Propertywise tests. +Propertywise [𐛪 𐛫] AreAlike +Propertywise [ي ۑ] AreAlike, Except: Confusable_MA Unicode_1_Name + diff --git a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java index f8a578325..9fe411807 100644 --- a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java +++ b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java @@ -35,6 +35,14 @@ void testUnicodeInvariants() throws IOException { assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(default) failed"); } + @Test + void testAdditionComparisons() throws IOException { + int rc = + TestUnicodeInvariants.testInvariants( + "AdditionComparisons.txt", "addition-comparisons", true); + assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(addition-comparisons) failed"); + } + @Test void testSecurityInvariants() throws IOException { int rc =