Skip to content

Commit

Permalink
Some invariants for the Equivalent_Unified_Ideograph of radicals (#691)
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin authored Feb 8, 2024
1 parent 07795f5 commit df9c173
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -270,11 +270,57 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc
return parseErrorCount + testFailureCount;
}

static class PropertyComparison {
/**
 * A per-code-point check over the code points of {@link #valueSet}.
 *
 * <p>Concrete subclasses decide, in {@link #getFailure(int)}, whether the check holds for one
 * code point; {@link #getFailures()} gathers the code points for which it does not.
 */
abstract static class PropertyPredicate {
    // The code points over which the predicate is evaluated.
    UnicodeSet valueSet;
    // The property whose value the concrete predicates inspect.
    UnicodeProperty property1;

    /**
     * Evaluates the predicate on each entry produced by iterating {@code valueSet}.
     *
     * @return a map from each failing code point to the description returned by
     *     {@link #getFailure(int)}; empty when the predicate holds everywhere.
     */
    public UnicodeMap<String> getFailures() {
        final UnicodeMap<String> result = new UnicodeMap<>();
        final UnicodeSetIterator iterator = new UnicodeSetIterator(valueSet);
        while (iterator.next()) {
            final int codepoint = iterator.codepoint;
            final String description = getFailure(codepoint);
            if (description == null) {
                // The predicate holds here; nothing to record.
                continue;
            }
            result.put(codepoint, description);
        }
        return result;
    }

    /**
     * Checks the predicate for a single code point.
     *
     * @param codepoint the code point to check.
     * @return a description of the failure, or {@code null} if the predicate holds.
     */
    protected abstract String getFailure(int codepoint);
}

/**
 * Predicate comparing the values of two properties on the same code point, expecting them to
 * be either equal or different depending on {@link #shouldBeEqual}.
 */
static class PropertyComparison extends PropertyPredicate {
    // Whether the two property values are expected to be equal.
    boolean shouldBeEqual;
    // The property compared against property1.
    UnicodeProperty property2;

    /**
     * @param codepoint the code point whose two property values are compared.
     * @return {@code null} when the comparison has the expected outcome; otherwise the two
     *     values joined by the relation that was actually observed ("=" or "≠").
     */
    @Override
    protected String getFailure(int codepoint) {
        final String left = property1.getValue(codepoint);
        final String right = property2.getValue(codepoint);
        // Null-safe: two null values count as equal.
        final boolean same = Objects.equals(left, right);
        if (same != shouldBeEqual) {
            final String observedRelation = same ? "=" : "≠";
            return left + observedRelation + right;
        }
        return null;
    }
}

/**
 * Predicate checking whether the value of {@code property1} on a code point is, or is not, a
 * member of {@link #set}, depending on {@link #shouldBeInSet}.
 */
static class PropertyValueContainment extends PropertyPredicate {
    // Whether the property value is expected to be a member of the set.
    boolean shouldBeInSet;
    // The set against which the property value is tested.
    UnicodeSet set;

    /**
     * @param codepoint the code point whose property value is tested for membership.
     * @return {@code null} when membership matches the expectation; otherwise the value
     *     followed by the relation that was actually observed ("∈" or "∉") and the set.
     */
    @Override
    protected String getFailure(int codepoint) {
        final String value = property1.getValue(codepoint);
        // A missing (null) value is treated as a member of no set, so that it is reported as
        // an ordinary failure (or success for ∉) rather than being passed to
        // UnicodeSet.contains. This mirrors the null-tolerant comparison used for = and ≠.
        final boolean isInSet = value != null && set.contains(value);
        if (isInSet == shouldBeInSet) {
            return null;
        } else {
            return value + (isInSet ? "∈" : "∉") + set;
        }
    }
}

private static void equivalencesLine(String line, ParsePosition pp, int lineNumber)
Expand Down Expand Up @@ -476,28 +522,14 @@ private static void equivalencesLine(String line, ParsePosition pp, int lineNumb
private static void inLine(ParsePosition pp, String line, int lineNumber)
throws ParseException {
pp.setIndex(2);
final PropertyComparison propertyComparison = getPropertyComparison(pp, line);
final UnicodeMap<String> failures = new UnicodeMap<>();

for (final UnicodeSetIterator it = new UnicodeSetIterator(propertyComparison.valueSet);
it.next(); ) {
final String value1 = propertyComparison.property1.getValue(it.codepoint);
final String value2 = propertyComparison.property2.getValue(it.codepoint);
final boolean areEqual = equals(value1, value2);
if (areEqual != propertyComparison.shouldBeEqual) {
failures.put(it.codepoint, value1 + (areEqual ? "=" : "≠") + value2);
}
}
final PropertyPredicate propertyPredicate = getPropertyPredicate(pp, line);
final UnicodeMap<String> failures = propertyPredicate.getFailures();
final UnicodeSet failureSet = failures.keySet();
final int failureCount = failureSet.size();
if (failureCount != 0) {
testFailureCount++;
printErrorLine("Test Failure", Side.START, testFailureCount);
String errorMessage =
"Got unexpected "
+ (propertyComparison.shouldBeEqual ? "differences" : "equalities")
+ ": "
+ failureCount;
String errorMessage = "Got unexpected property values: " + failureCount;
println("## " + errorMessage);

final UnicodeLabel failureProp = new UnicodeProperty.UnicodeMapProperty().set(failures);
Expand Down Expand Up @@ -533,34 +565,41 @@ private static void expectToken(String token, ParsePosition pp, String line)
scan(PATTERN_WHITE_SPACE, line, pp, true);
}

private static PropertyComparison getPropertyComparison(ParsePosition pp, String line)
private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String line)
throws ParseException {
final PropertyComparison propertyComparison = new PropertyComparison();
PropertyPredicate predicate;

propertyComparison.valueSet = new UnicodeSet(line, pp, symbolTable);
final UnicodeSet valueSet = new UnicodeSet(line, pp, symbolTable);
expectToken(",", pp, line);
propertyComparison.property1 = CompoundProperty.of(LATEST_PROPS, line, pp);
final UnicodeProperty property1 = CompoundProperty.of(LATEST_PROPS, line, pp);
final int cp = line.codePointAt(pp.getIndex());
if (cp != '=' && cp != '≠') {
throw new ParseException(line, pp.getIndex());
switch (cp) {
case '=':
case '≠':
final var comparison = new PropertyComparison();
comparison.shouldBeEqual = cp == '=';
pp.setIndex(pp.getIndex() + 1);
comparison.property2 = CompoundProperty.of(LATEST_PROPS, line, pp);
predicate = comparison;
break;
case '∈':
case '∉':
final var containment = new PropertyValueContainment();
containment.shouldBeInSet = cp == '∈';
pp.setIndex(pp.getIndex() + 1);
containment.set = new UnicodeSet(line, pp, symbolTable);
predicate = containment;
break;
default:
throw new ParseException("Expected =|≠|∈|∉", pp.getIndex());
}
propertyComparison.shouldBeEqual = cp == '=';
pp.setIndex(pp.getIndex() + 1);
propertyComparison.property2 = CompoundProperty.of(LATEST_PROPS, line, pp);
predicate.valueSet = valueSet;
predicate.property1 = property1;
scan(PATTERN_WHITE_SPACE, line, pp, true);
if (pp.getIndex() != line.length()) {
throw new ParseException(line, pp.getIndex());
}
return propertyComparison;
}

private static boolean equals(Object value1, Object value2) {
if (value1 == null) {
return value2 == null;
} else if (value2 == null) {
return false;
}
return value1.equals(value2);
return predicate;
}

static class CompoundProperty extends UnicodeProperty {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@
# then deduplicates runs of the same Bidi_Class.
# It then compares that with the result of filtering out NSM characters from X, then getting the Bidi_Class.
#
# In <unicodeSet>, <props> (∈|∉) <unicodeSet>
# For each character in the first <unicodeSet>, verify that the result of applying <props>
# to it is (∈|∉) the second <unicodeSet>.
##########################
# OnPairsOf <unicodeSet>, EqualityOf <props> (⇐|⇔|⇒|⇍|⇎|⇏) EqualityOf <props>
#
Expand Down Expand Up @@ -983,23 +986,35 @@ Let $nonIdeographicStrokes = \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZ
# Its value is a single unified ideograph.
\P{Equivalent_Unified_Ideograph=@none@} ⊆ $strokesAndRadicals
[$strokesAndRadicals - \P{Equivalent_Unified_Ideograph=@none@}] = [$nonIdeographicStrokes $nonIdeographicRadicals]
In \P{Equivalent_Unified_Ideograph=@none@}, Unified_Ideograph * Equivalent_Unified_Ideograph = (constant Yes)
In \P{Equivalent_Unified_Ideograph=@none@}, Equivalent_Unified_Ideograph ∈ \p{Unified_Ideograph}

# Strokes are equivalent to a single-stroke ideograph, except for one strange one.
# 𠄎 is the Equivalent_Unified_Ideograph of the stroke ㇡, but it has two strokes.
# TODO(egg): It might have one stroke in CN, see the one-stroke G-source glyph and
# https://www.zdic.net/hans/%F0%A0%84%8E which has it as 5.0, 1 stroke
# (contrast https://www.zdic.net/hant/%F0%A0%84%8E which is 6.1, 2 strokes).
# This has been submitted as feedback on PRI #483.
# If the kTotalStrokes value of 𠄎 gets changed to 1|2, the variable
# $strokesWith2StrokeLookalikes can be removed.
Let $strokesWith2StrokeLookalikes = \N{CJK STROKE HZZZG}

In [$cjkStrokes - $nonIdeographicStrokes - $strokesWith2StrokeLookalikes], kTotalStrokes * Equivalent_Unified_Ideograph = (constant 1)
In [$cjkStrokes - $nonIdeographicStrokes - $strokesWith2StrokeLookalikes], Equivalent_Unified_Ideograph ∈ \p{kTotalStrokes=1}
In $strokesWith2StrokeLookalikes, kTotalStrokes * Equivalent_Unified_Ideograph = (constant 2)

# TODO(egg): Some invariants for the Equivalent_Unified_Ideographs of radicals.
# Easy enough for the Kangxi radicals (they are kRSUnicode=n.0), but trickier for the radicals supplement.
# In particular I would expect those that are called SIMPLIFIED to have a '.0 or ''.0 kRSUnicode value,
# but some do not.
# Kangxi radicals are equivalent to those radicals with no residual strokes.
In $kangxiRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/\.0/}
# Simplified radicals are equivalent to a simplified radical with no residual strokes.
# That is a Chinese simplified radical (kRSUnicode=n'.0) for Chinese simplified radicals, and a
# non-Chinese simplified radical (kRSUnicode=n''.0) otherwise.
# However, two of the simplified radicals are unifiable with their non-simplified counterparts,
# and are therefore equivalent to ideographs with kRSUnicode=n.0.
Let $radicalsWithUnifiableSimplifications = [角辶]
$radicalsWithUnifiableSimplifications ⊆ \p{kRSUnicode=/^[0-9]+\.0$/}
[$radicalsWithUnifiableSimplifications & \p{kRSUnicode=/^[0-9]+'\.0$/}] = []
Let $chineseSimplifiedRadicals = \p{Name=/CJK RADICAL (C-)?SIMPLIFIED/}
Let $japaneseSimplifiedRadicals = \p{Name=/CJK RADICAL J-SIMPLIFIED/}
In $chineseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ [\p{kRSUnicode=/^[0-9]+'\.0$/} $radicalsWithUnifiableSimplifications]
In $japaneseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/^[0-9]+''\.0$/}

# InPC-InSC-gc invariants
# See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf.
Expand Down

0 comments on commit df9c173

Please sign in to comment.