From df9c1738541244f46faca77b3eedb318a661d7d1 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 9 Feb 2024 00:26:18 +0100 Subject: [PATCH] Some invariants for the Equivalent_Unified_Ideograph of radicals (#691) --- .../text/UCD/TestUnicodeInvariants.java | 113 ++++++++++++------ .../unicode/text/UCD/UnicodeInvariantTest.txt | 27 ++++- 2 files changed, 97 insertions(+), 43 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index a30dab80d..40fc44b1b 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -270,11 +270,57 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc return parseErrorCount + testFailureCount; } - static class PropertyComparison { + abstract static class PropertyPredicate { UnicodeSet valueSet; UnicodeProperty property1; + + public UnicodeMap<String> getFailures() { + final UnicodeMap<String> failures = new UnicodeMap<>(); + + for (final UnicodeSetIterator it = new UnicodeSetIterator(valueSet); it.next(); ) { + final String failure = getFailure(it.codepoint); + if (failure != null) { + failures.put(it.codepoint, failure); + } + } + return failures; + } + + // A description of the failure for the given codepoint, or null if the predicate holds. + protected abstract String getFailure(int codepoint); + } + + static class PropertyComparison extends PropertyPredicate { boolean shouldBeEqual; UnicodeProperty property2; + + @Override + protected String getFailure(int codepoint) { + final String value1 = property1.getValue(codepoint); + final String value2 = property2.getValue(codepoint); + final boolean areEqual = Objects.equals(value1, value2); + if (areEqual == shouldBeEqual) { + return null; + } else { + return value1 + (areEqual ?
"=" : "≠") + value2; + } + } + } + + static class PropertyValueContainment extends PropertyPredicate { + boolean shouldBeInSet; + UnicodeSet set; + + @Override + protected String getFailure(int codepoint) { + final String value = property1.getValue(codepoint); + final boolean isInSet = set.contains(value); + if (isInSet == shouldBeInSet) { + return null; + } else { + return value + (isInSet ? "∈" : "∉") + set; + } + } } private static void equivalencesLine(String line, ParsePosition pp, int lineNumber) @@ -476,28 +522,14 @@ private static void equivalencesLine(String line, ParsePosition pp, int lineNumb private static void inLine(ParsePosition pp, String line, int lineNumber) throws ParseException { pp.setIndex(2); - final PropertyComparison propertyComparison = getPropertyComparison(pp, line); - final UnicodeMap<String> failures = new UnicodeMap<>(); - - for (final UnicodeSetIterator it = new UnicodeSetIterator(propertyComparison.valueSet); - it.next(); ) { - final String value1 = propertyComparison.property1.getValue(it.codepoint); - final String value2 = propertyComparison.property2.getValue(it.codepoint); - final boolean areEqual = equals(value1, value2); - if (areEqual != propertyComparison.shouldBeEqual) { - failures.put(it.codepoint, value1 + (areEqual ? "=" : "≠") + value2); - } - } + final PropertyPredicate propertyPredicate = getPropertyPredicate(pp, line); + final UnicodeMap<String> failures = propertyPredicate.getFailures(); final UnicodeSet failureSet = failures.keySet(); final int failureCount = failureSet.size(); if (failureCount != 0) { testFailureCount++; printErrorLine("Test Failure", Side.START, testFailureCount); - String errorMessage = - "Got unexpected " - + (propertyComparison.shouldBeEqual ?
"differences" : "equalities") - + ": " - + failureCount; + String errorMessage = "Got unexpected property values: " + failureCount; println("## " + errorMessage); final UnicodeLabel failureProp = new UnicodeProperty.UnicodeMapProperty().set(failures); @@ -533,34 +565,41 @@ private static void expectToken(String token, ParsePosition pp, String line) scan(PATTERN_WHITE_SPACE, line, pp, true); } - private static PropertyComparison getPropertyComparison(ParsePosition pp, String line) + private static PropertyPredicate getPropertyPredicate(ParsePosition pp, String line) throws ParseException { - final PropertyComparison propertyComparison = new PropertyComparison(); + PropertyPredicate predicate; - propertyComparison.valueSet = new UnicodeSet(line, pp, symbolTable); + final UnicodeSet valueSet = new UnicodeSet(line, pp, symbolTable); expectToken(",", pp, line); - propertyComparison.property1 = CompoundProperty.of(LATEST_PROPS, line, pp); + final UnicodeProperty property1 = CompoundProperty.of(LATEST_PROPS, line, pp); final int cp = line.codePointAt(pp.getIndex()); - if (cp != '=' && cp != '≠') { - throw new ParseException(line, pp.getIndex()); + switch (cp) { + case '=': + case '≠': + final var comparison = new PropertyComparison(); + comparison.shouldBeEqual = cp == '='; + pp.setIndex(pp.getIndex() + 1); + comparison.property2 = CompoundProperty.of(LATEST_PROPS, line, pp); + predicate = comparison; + break; + case '∈': + case '∉': + final var containment = new PropertyValueContainment(); + containment.shouldBeInSet = cp == '∈'; + pp.setIndex(pp.getIndex() + 1); + containment.set = new UnicodeSet(line, pp, symbolTable); + predicate = containment; + break; + default: + throw new ParseException("Expected =|≠|∈|∉", pp.getIndex()); } - propertyComparison.shouldBeEqual = cp == '='; - pp.setIndex(pp.getIndex() + 1); - propertyComparison.property2 = CompoundProperty.of(LATEST_PROPS, line, pp); + predicate.valueSet = valueSet; + predicate.property1 = property1; 
scan(PATTERN_WHITE_SPACE, line, pp, true); if (pp.getIndex() != line.length()) { throw new ParseException(line, pp.getIndex()); } - return propertyComparison; - } - - private static boolean equals(Object value1, Object value2) { - if (value1 == null) { - return value2 == null; - } else if (value2 == null) { - return false; - } - return value1.equals(value2); + return predicate; } static class CompoundProperty extends UnicodeProperty { diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index aaaabb445..eceb0f5ad 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -86,6 +86,9 @@ # then deduplicates runs of the same Bidi_Class. # It then compares that with the result of filtering out NSM characters from X, then getting the Bidi_Class. # +# In <unicodeSet>, <property> (∈|∉) <unicodeSet> +# For each character in the first <unicodeSet>, verify that the result of applying the left <property> +# is (∈|∉) the right-hand-side unicodeSet. ########################## # OnPairsOf <unicodeSet>, EqualityOf <property> (⇐|⇔|⇒|⇍|⇎|⇏) EqualityOf <property> # @@ -983,7 +986,7 @@ Let $nonIdeographicStrokes = \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZ # Its value is a single unified ideograph. \P{Equivalent_Unified_Ideograph=@none@} ⊆ $strokesAndRadicals [$strokesAndRadicals - \P{Equivalent_Unified_Ideograph=@none@}] = [$nonIdeographicStrokes $nonIdeographicRadicals] -In \P{Equivalent_Unified_Ideograph=@none@}, Unified_Ideograph * Equivalent_Unified_Ideograph = (constant Yes) +In \P{Equivalent_Unified_Ideograph=@none@}, Equivalent_Unified_Ideograph ∈ \p{Unified_Ideograph} # Strokes are equivalent to a single-stroke ideograph, except for one strange one. # 𠄎 is the Equivalent_Unified_Ideograph of the stroke ㇡, but it has two strokes.
@@ -991,15 +994,27 @@ In \P{Equivalent_Unified_Ideograph=@none@}, Unified_Ideograph * Equivalent_Unifi # https://www.zdic.net/hans/%F0%A0%84%8E which has it as 5.0, 1 stroke # (contrast https://www.zdic.net/hant/%F0%A0%84%8E which is 6.1, 2 strokes). # This has been submitted as feedback on PRI #483. +# If the kTotalStrokes value of 𠄎 gets changed to 1|2, the variable +# $strokesWith2StrokeLookalikes can be removed. Let $strokesWith2StrokeLookalikes = \N{CJK STROKE HZZZG} -In [$cjkStrokes - $nonIdeographicStrokes - $strokesWith2StrokeLookalikes], kTotalStrokes * Equivalent_Unified_Ideograph = (constant 1) +In [$cjkStrokes - $nonIdeographicStrokes - $strokesWith2StrokeLookalikes], Equivalent_Unified_Ideograph ∈ \p{kTotalStrokes=1} In $strokesWith2StrokeLookalikes, kTotalStrokes * Equivalent_Unified_Ideograph = (constant 2) -# TODO(egg): Some invariants for the Equivalent_Unified_Ideographs of radicals. -# Easy enough for the Kangxi radicals (they are kRSUnicode=n.0), but trickier for the radicals supplement. -# In particular I would expect those that are called SIMPLIFIED to have a '.0 or ''.0 kRSUnicode value, -# but some do not. +# Kangxi radicals are equivalent to those radicals with no residual strokes. +In $kangxiRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/\.0/} +# Simplified radicals are equivalent to a simplified radical with no residual strokes. +# That is a Chinese simplified radical (kRSUnicode=n'.0) for Chinese simplified radicals, and a +# non-Chinese simplified radical (kRSUnicode=n''.0) otherwise. +# However, two of the simplified radicals are unifiable with their non-simplified counterparts, +# and are therefore equivalent to ideographs with kRSUnicode=n.0. 
+Let $radicalsWithUnifiableSimplifications = [角辶] +$radicalsWithUnifiableSimplifications ⊆ \p{kRSUnicode=/^[0-9]+\.0$/} +[$radicalsWithUnifiableSimplifications & \p{kRSUnicode=/^[0-9]+'\.0$/}] = [] +Let $chineseSimplifiedRadicals = \p{Name=/CJK RADICAL (C-)?SIMPLIFIED/} +Let $japaneseSimplifiedRadicals = \p{Name=/CJK RADICAL J-SIMPLIFIED/} +In $chineseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ [\p{kRSUnicode=/^[0-9]+'\.0$/} $radicalsWithUnifiableSimplifications] +In $japaneseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/^[0-9]+''\.0$/} # InPC-InSC-gc invariants # See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf.