Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Propertywise … AreAlike #842

Merged
merged 20 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,9 @@ public synchronized UnicodeMap<String> load(UcdProperty prop2, boolean expectCac
final String fileName = fileInfo.getFileName(ucdVersion);

if (FILE_CACHE) {
// TODO(egg): When using cached property data, most defaults do not get
// loaded in PropertyParsingInfo, as that happens in parseSourceFile.
// Only the ones from the Extra files are loaded.
data0 = getCachedMap(prop2, fullFilename);
if (data0 != null) {
property2UnicodeMap.put(prop2, data0.freeze());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.regex.Pattern;
Expand All @@ -31,6 +33,8 @@
import org.unicode.jsp.ICUPropertyFactory;
import org.unicode.props.BagFormatter;
import org.unicode.props.IndexUnicodeProperties;
import org.unicode.props.IndexUnicodeProperties.DefaultValueType;
import org.unicode.props.UcdProperty;
import org.unicode.props.UnicodeProperty;
import org.unicode.props.UnicodeProperty.Factory;
import org.unicode.text.utility.Settings;
Expand Down Expand Up @@ -234,6 +238,8 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang
letLine(pp, line);
} else if (line.startsWith("In")) {
inLine(pp, line, inputFile, lineNumber);
} else if (line.startsWith("Propertywise")) {
propertywiseLine(pp, line, inputFile, lineNumber);
} else if (line.startsWith("ShowScript")) {
showScript = true;
} else if (line.startsWith("HideScript")) {
Expand Down Expand Up @@ -326,6 +332,84 @@ protected String getFailure(int codepoint) {
}
}

/**
 * Handles a line of the form {@code Propertywise <unicodeSet> AreAlike [, Except: <properties>]}.
 *
 * <p>For every {@link UcdProperty}, compares the value at each code point of the set with the
 * value at the first code point of the set, and reports a single test failure listing every
 * differing property. The Name property is always skipped, as are the properties named in the
 * space-separated {@code Except:} clause (matched against the property's name aliases).
 *
 * <p>A property whose resolved default value type is {@code CODE_POINT} is not reported when
 * both code points map to themselves.
 *
 * @param pp parse position within {@code line}; reset to just after the keyword
 * @param line the full text of the invariant line
 * @param file name of the invariants file, used when reporting a failure
 * @param lineNumber line number within {@code file}, used when reporting a failure
 * @throws ParseException if the set contains strings, or if an expected token is missing
 */
private static void propertywiseLine(ParsePosition pp, String line, String file, int lineNumber)
        throws ParseException {
    pp.setIndex("Propertywise".length());
    final UnicodeSet set = new UnicodeSet(line, pp, symbolTable);
    if (set.hasStrings()) {
        throw new ParseException(
                "Set should contain only single code points for property comparison",
                pp.getIndex());
    }
    expectToken("AreAlike", pp, line);
    if (pp.getIndex() < line.length()) {
        expectToken(",", pp, line);
        expectToken("Except", pp, line);
        expectToken(":", pp, line);
    }
    // Names are expected to differ between characters, so Name is always excluded.
    final Set<String> excludedProperties = new HashSet<>();
    excludedProperties.add("Name");
    while (pp.getIndex() < line.length()) {
        // Each iteration consumes one property name and the white space that follows it.
        final int propertyNameStart = pp.getIndex();
        scan(PATTERN_WHITE_SPACE, line, pp, false);
        excludedProperties.add(line.substring(propertyNameStart, pp.getIndex()));
        scan(PATTERN_WHITE_SPACE, line, pp, true);
    }
    final var iup = IndexUnicodeProperties.make(Settings.latestVersion);
    final List<String> errorMessageLines = new ArrayList<>();
    for (var p : UcdProperty.values()) {
        final var property = iup.getProperty(p);
        if (property.getNameAliases().stream().anyMatch(excludedProperties::contains)) {
            continue;
        }
        // NOTE(review): for an empty set charAt(0) presumably yields -1; confirm that
        // getValue tolerates that, or reject empty sets explicitly.
        final int first = set.charAt(0);
        final String p1 = property.getValue(first);
        for (var range : set.ranges()) {
            for (int c = range.codepoint; c <= range.codepointEnd; ++c) {
                if (c == first) {
                    continue;
                }
                final String p2 = property.getValue(c);
                if (Objects.equals(p1, p2)) {
                    continue;
                }
                // The values differ, so at most one of them is null. The known non-null
                // string is the receiver of equals so that a null value is reported as a
                // difference instead of throwing a NullPointerException.
                final boolean bothMapToSelf =
                        IndexUnicodeProperties.getResolvedDefaultValueType(p)
                                        == DefaultValueType.CODE_POINT
                                && Character.toString(first).equals(p1)
                                && Character.toString(c).equals(p2);
                if (bothMapToSelf) {
                    continue;
                }
                errorMessageLines.add(
                        property.getName()
                                + "("
                                + Character.toString(first)
                                + ")\t=\t"
                                + p1
                                + "\t≠\t"
                                + p2
                                + "\t=\t"
                                + property.getName()
                                + "("
                                + Character.toString(c)
                                + ")");
            }
        }
    }
    if (!errorMessageLines.isEmpty()) {
        testFailureCount++;
        printErrorLine("Test Failure", Side.START, testFailureCount);
        reportTestFailure(
                file, lineNumber, String.join("\n", errorMessageLines).replace('\t', ' '));
        // Tabs in each message become cell boundaries of one table row.
        out.println("<table class='f'>");
        for (String errorMessageLine : errorMessageLines) {
            out.println("<tr><td>");
            out.println(toHTML.transform(errorMessageLine).replace("\t", "</td><td>"));
            // Close the cell before the row; the reverse order is malformed HTML.
            out.println("</td></tr>");
        }
        out.println("</table>");
        printErrorLine("Test Failure", Side.END, testFailureCount);
    }
}

private static void equivalencesLine(String line, ParsePosition pp, String file, int lineNumber)
throws ParseException {
pp.setIndex("OnPairsOf".length());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,15 @@
# Overrides for bugs

# TODO(egg): These are specified in their respective files; we should not need them here.

# @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; <none>
# @missing: 0000..10FFFF; Equivalent_Unified_Ideograph; <none>

# At least the following two appear to be needed because of issues related to caching;
# see comments in IndexUnicodeProperties.java.
# @missing: 0000..10FFFF; NFKC_Casefold; <code point>
# @missing: 0000..10FFFF; NFKC_SCF; <code point>

# Extras

# @missing: 0000..10FFFF; Idn_Status ; disallowed
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# This file uses the invariant test language, but contains comparisons between
# new and pre-existing characters to aid in PAG review of encoding proposals.

## Unicode 16.0 additions.

# These comparisons were not in place when properties were initially assigned for the 16.0
# répertoire.
# We note here the feedback about errors that would have been caught by them.

# U+18CFF is a blank character for the Khitan Small Script; aside from looking blank,
# it is indistinguishable from other Khitan Small Script characters. See L2/23-065.
# In particular, it is ideographic: https://www.unicode.org/review/pri497/feedback.html#ID20240216140104.
Propertywise [\N{KHITAN SMALL SCRIPT CHARACTER-18CFF} \N{KHITAN SMALL SCRIPT CHARACTER-18B00}] AreAlike, Except: Age

# HXG (briefly known as HZXG) and SZP are just like all the other CJK strokes.
# In particular, they are scx=Hani: https://www.unicode.org/review/pri502/feedback.html#ID20240523095709.
Propertywise [\N{CJK STROKE T} \N{CJK STROKE HXG}\N{CJK STROKE SZP}] AreAlike, Except: Age

## Provisionally assigned. [placeholder for draft PRs]
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,20 @@
# OnPairsOf $strings, EqualityOf Case_Folding ⇏ EqualityOf Simple_Case_Folding
# OnPairsOf $strings, EqualityOf Case_Folding ⇐ EqualityOf Simple_Case_Folding
##########################
# Propertywise <unicodeSet> AreAlike [, Except: <properties>]
#
# Checks that all property assignments of the code points in <unicodeSet> are the same,
# except for the Name property and any properties listed in the space-separated
# Except clause.
# For the purposes of this check, if all characters in <unicodeSet> are mapped to themselves
# by some property with default value <code point>, these assignments are considered the same.
#
# Example: Propertywise [𐛪 𐛫] AreAlike
# These two Linear A signs (A751 and A752) behave identically.
# Example: Propertywise [ي ۑ] AreAlike, Except: Confusable_MA Unicode_1_Name
# This checks that yeh (with two dots) and yeh with three dots behave the same,
# except for confusability and their name in Unicode 1 (both have one, so it is different).
##########################
# There is new syntax for testing UnicodeMaps
#
# Map <unicodeMap> <relation> <unicodeMap>
Expand Down Expand Up @@ -1091,3 +1105,7 @@ In [\p{Block=Hangul Syllables} - \p{gc=Cn}], (prepend HANGUL SYLLABLE ) * (strin
# https://www.unicode.org/review/pri497/feedback.html#ID20240216135149.
In \p{Decomposition_Type=font}, Bidi_Class = Bidi_Class * Decomposition_Mapping

# Basic Propertywise tests.
Propertywise [𐛪 𐛫] AreAlike
Propertywise [ي ۑ] AreAlike, Except: Confusable_MA Unicode_1_Name

Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ void testUnicodeInvariants() throws IOException {
assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(default) failed");
}

/**
 * Runs the invariant tests from AdditionComparisons.txt and requires a return value of 0.
 */
@Test
void testAdditionComparisons() throws IOException {
    assertEquals(
            0,
            TestUnicodeInvariants.testInvariants(
                    "AdditionComparisons.txt", "addition-comparisons", true),
            "TestUnicodeInvariants.testInvariants(addition-comparisons) failed");
}

@Test
void testSecurityInvariants() throws IOException {
int rc =
Expand Down
Loading