Some invariants for the Equivalent_Unified_Ideograph of radicals (#691)

unicode-org · Feb 8, 2024 · df9c173 · df9c173
1 parent 07795f5
commit df9c173
Show file tree

Hide file tree

Showing 2 changed files with 97 additions and 43 deletions.
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -270,11 +270,57 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc
         return parseErrorCount + testFailureCount;
     }
 
-    static class PropertyComparison {
+    abstract static class PropertyPredicate {
         UnicodeSet valueSet;
         UnicodeProperty property1;
+
+        public UnicodeMap<String> getFailures() {
+            final UnicodeMap<String> failures = new UnicodeMap<>();
+
+            for (final UnicodeSetIterator it = new UnicodeSetIterator(valueSet); it.next(); ) {
+                final String failure = getFailure(it.codepoint);
+                if (failure != null) {
+                    failures.put(it.codepoint, failure);
+                }
+            }
+            return failures;
+        }
+
+        // A description of the failure for the given codepoint, or null if the predicate holds.
+        protected abstract String getFailure(int codepoint);
+    }
+
+    static class PropertyComparison extends PropertyPredicate {
         boolean shouldBeEqual;
         UnicodeProperty property2;
+
+        @Override
+        protected String getFailure(int codepoint) {
+            final String value1 = property1.getValue(codepoint);
+            final String value2 = property2.getValue(codepoint);
+            final boolean areEqual = Objects.equals(value1, value2);
+            if (areEqual == shouldBeEqual) {
+                return null;
+            } else {
+                return value1 + (areEqual ? "=" : "≠") + value2;
+            }
+        }
+    }
+
+    static class PropertyValueContainment extends PropertyPredicate {
+        boolean shouldBeInSet;
+        UnicodeSet set;
+
+        @Override
+        protected String getFailure(int codepoint) {
+            final String value = property1.getValue(codepoint);
+            final boolean isInSet = set.contains(value);
+            if (isInSet == shouldBeInSet) {
+                return null;
+            } else {
+                return value + (isInSet ? "∈" : "∉") + set;
+            }
+        }
     }
 
     private static void equivalencesLine(String line, ParsePosition pp, int lineNumber)
@@ -476,28 +522,14 @@ private static void equivalencesLine(String line, ParsePosition pp, int lineNumb
     private static void inLine(ParsePosition pp, String line, int lineNumber)
             throws ParseException {
         pp.setIndex(2);
-        final PropertyComparison propertyComparison = getPropertyComparison(pp, line);
-        final UnicodeMap<String> failures = new UnicodeMap<>();
-
-        for (final UnicodeSetIterator it = new UnicodeSetIterator(propertyComparison.valueSet);
-                it.next(); ) {
-            final String value1 = propertyComparison.property1.getValue(it.codepoint);
-            final String value2 = propertyComparison.property2.getValue(it.codepoint);
-            final boolean areEqual = equals(value1, value2);
-            if (areEqual != propertyComparison.shouldBeEqual) {
-                failures.put(it.codepoint, value1 + (areEqual ? "=" : "≠") + value2);
-            }
-        }
+        final PropertyPredicate propertyPredicate = getPropertyPredicate(pp, line);
+        final UnicodeMap<String> failures = propertyPredicate.getFailures();
         final UnicodeSet failureSet = failures.keySet();
         final int failureCount = failureSet.size();
         if (failureCount != 0) {
             testFailureCount++;
             printErrorLine("Test Failure", Side.START, testFailureCount);
-            String errorMessage =
-                    "Got unexpected "
-                            + (propertyComparison.shouldBeEqual ? "differences" : "equalities")
-                            + ": "
-                            + failureCount;
+            String errorMessage = "Got unexpected property values: " + failureCount;
             println("## " + errorMessage);
 
             final UnicodeLabel failureProp = new UnicodeProperty.UnicodeMapProperty().set(failures);
@@ -533,34 +565,41 @@ private static void expectToken(String token, ParsePosition pp, String line)
         scan(PATTERN_WHITE_SPACE, line, pp, true);
     }
 
-    private static PropertyComparison getPropertyComparison(ParsePosition pp, String line)
+    private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String line)
             throws ParseException {
-        final PropertyComparison propertyComparison = new PropertyComparison();
+        PropertyPredicate predicate;
 
-        propertyComparison.valueSet = new UnicodeSet(line, pp, symbolTable);
+        final UnicodeSet valueSet = new UnicodeSet(line, pp, symbolTable);
         expectToken(",", pp, line);
-        propertyComparison.property1 = CompoundProperty.of(LATEST_PROPS, line, pp);
+        final UnicodeProperty property1 = CompoundProperty.of(LATEST_PROPS, line, pp);
         final int cp = line.codePointAt(pp.getIndex());
-        if (cp != '=' && cp != '≠') {
-            throw new ParseException(line, pp.getIndex());
+        switch (cp) {
+            case '=':
+            case '≠':
+                final var comparison = new PropertyComparison();
+                comparison.shouldBeEqual = cp == '=';
+                pp.setIndex(pp.getIndex() + 1);
+                comparison.property2 = CompoundProperty.of(LATEST_PROPS, line, pp);
+                predicate = comparison;
+                break;
+            case '∈':
+            case '∉':
+                final var containment = new PropertyValueContainment();
+                containment.shouldBeInSet = cp == '∈';
+                pp.setIndex(pp.getIndex() + 1);
+                containment.set = new UnicodeSet(line, pp, symbolTable);
+                predicate = containment;
+                break;
+            default:
+                throw new ParseException("Expected =|≠|∈|∉", pp.getIndex());
         }
-        propertyComparison.shouldBeEqual = cp == '=';
-        pp.setIndex(pp.getIndex() + 1);
-        propertyComparison.property2 = CompoundProperty.of(LATEST_PROPS, line, pp);
+        predicate.valueSet = valueSet;
+        predicate.property1 = property1;
         scan(PATTERN_WHITE_SPACE, line, pp, true);
         if (pp.getIndex() != line.length()) {
             throw new ParseException(line, pp.getIndex());
         }
-        return propertyComparison;
-    }
-
-    private static boolean equals(Object value1, Object value2) {
-        if (value1 == null) {
-            return value2 == null;
-        } else if (value2 == null) {
-            return false;
-        }
-        return value1.equals(value2);
+        return predicate;
     }
 
     static class CompoundProperty extends UnicodeProperty {

diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -86,6 +86,9 @@
 #       then deduplicates runs of the same Bidi_Class.
 #       It then compares that with the result of filtering out NSM characters from X, then getting the Bidi_Class.
 #
+# In <unicodeSet>, <props> (∈|∉) <unicodeSet>
+#   For each character in the first <unicodeSet>, verify that the result of applying the left <props>
+#   is (∈|∉) the right-hand-side <unicodeSet>.
 ##########################
 # OnPairsOf <unicodeSet>, EqualityOf <props> (⇐|⇔|⇒|⇍|⇎|⇏) EqualityOf <props>
 # 
@@ -983,23 +986,35 @@ Let $nonIdeographicStrokes = \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZ
 # Its value is a single unified ideograph.
 \P{Equivalent_Unified_Ideograph=@none@} ⊆ $strokesAndRadicals
 [$strokesAndRadicals - \P{Equivalent_Unified_Ideograph=@none@}] = [$nonIdeographicStrokes $nonIdeographicRadicals]
-In \P{Equivalent_Unified_Ideograph=@none@}, Unified_Ideograph * Equivalent_Unified_Ideograph = (constant Yes)
+In \P{Equivalent_Unified_Ideograph=@none@}, Equivalent_Unified_Ideograph ∈ \p{Unified_Ideograph}
 
 # Strokes are equivalent to a single-stroke ideograph, except for one strange one.
 # 𠄎 is the Equivalent_Unified_Ideograph of the stroke ㇡, but it has two strokes.
 # TODO(egg): It might have one stroke in CN, see the one-stroke G-source glyph and
 # https://www.zdic.net/hans/%F0%A0%84%8E which has it as 5.0, 1 stroke
 # (contrast https://www.zdic.net/hant/%F0%A0%84%8E which is 6.1, 2 strokes).
 # This has been submitted as feedback on PRI #483.
+# If the kTotalStrokes value of 𠄎 gets changed to 1|2, the variable
+# $strokesWith2StrokeLookalikes can be removed.
 Let $strokesWith2StrokeLookalikes = \N{CJK STROKE HZZZG}
 
-In [$cjkStrokes - $nonIdeographicStrokes - $strokesWith2StrokeLookalikes], kTotalStrokes * Equivalent_Unified_Ideograph = (constant 1)
+In [$cjkStrokes - $nonIdeographicStrokes - $strokesWith2StrokeLookalikes], Equivalent_Unified_Ideograph ∈ \p{kTotalStrokes=1}
 In $strokesWith2StrokeLookalikes, kTotalStrokes * Equivalent_Unified_Ideograph = (constant 2)
 
-# TODO(egg): Some invariants for the Equivalent_Unified_Ideographs of radicals.
-# Easy enough for the Kangxi radicals (they are kRSUnicode=n.0), but trickier for the radicals supplement.
-# In particular I would expect those that are called SIMPLIFIED to have a '.0 or ''.0 kRSUnicode value,
-# but some do not.
+# Kangxi radicals are equivalent to ideographs that are radicals with no residual strokes.
+In $kangxiRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/\.0/}
+# Each simplified radical is equivalent to an ideograph that is a simplified radical with no residual strokes.
+# That is a Chinese simplified radical (kRSUnicode=n'.0) for Chinese simplified radicals, and a
+# non-Chinese simplified radical (kRSUnicode=n''.0) otherwise.
+# However, two of the simplified radicals are unifiable with their non-simplified counterparts,
+# and are therefore equivalent to ideographs with kRSUnicode=n.0.
+Let $radicalsWithUnifiableSimplifications = [角辶]
+$radicalsWithUnifiableSimplifications ⊆ \p{kRSUnicode=/^[0-9]+\.0$/}
+[$radicalsWithUnifiableSimplifications & \p{kRSUnicode=/^[0-9]+'\.0$/}] = []
+Let $chineseSimplifiedRadicals = \p{Name=/CJK RADICAL (C-)?SIMPLIFIED/}
+Let $japaneseSimplifiedRadicals = \p{Name=/CJK RADICAL J-SIMPLIFIED/}
+In $chineseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ [\p{kRSUnicode=/^[0-9]+'\.0$/} $radicalsWithUnifiableSimplifications]
+In $japaneseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/^[0-9]+''\.0$/}
 
 # InPC-InSC-gc invariants
 # See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf.