From 77e3ce601ff2970b1a9b8bc7cf5d9445fd4554e5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 18 Jan 2024 07:48:10 +0100 Subject: [PATCH] Fix the handling of multivalued Unihan properties (#652) --- .../org/unicode/props/UnicodeProperty.java | 23 ++++++++++++------- .../unicode/text/UCD/UnicodeInvariantTest.txt | 13 +++++------ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index fc99730d8..016643106 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -286,14 +286,21 @@ public List getValueAliases(String valueAlias, List result) { if (result == null) result = new ArrayList<>(1); result = _getValueAliases(valueAlias, result); if (!result.contains(valueAlias)) { // FIX && type < NUMERIC - result = _getValueAliases(valueAlias, result); // for debugging - throw new IllegalArgumentException( - "Internal error: " - + getName() - + " doesn't contain " - + valueAlias - + ": " - + new BagFormatter().join(result)); + if (type == MISC) { + // Unihan has multivalued properties but does not use aliases. 
+ result.add(valueAlias); + } else { + result = _getValueAliases(valueAlias, result); // for debugging + throw new IllegalArgumentException( + "Internal error: " + + getName() + + " (" + + getTypeName() + + ") doesn't contain " + + valueAlias + + ": " + + new BagFormatter().join(result)); + } } return result; } diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 7f8807958..34928fd9f 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -892,14 +892,13 @@ Let $ideohack = [〆 〇 〡-〩] # 10.0 added Nushu # 13.0 added Khitan_Small_Script -# Following don't work yet. Tested independently in TestInvariants -# \p{Unified_Ideograph} ⊇ \P{kRSUnicode=∅} +\p{Unified_Ideograph} ⊂ \p{kRSUnicode=/./} +\p{kRSUnicode=/./} = [\p{Block=/^CJK.(Unified|Compatibility).Ideographs/} - \p{gc=Cn}] +\p{kRSUnicode=/./} = \p{kTotalStrokes=/./} -# \p{Unified_Ideograph} = \P{kRSUnicode=∅} - -# \P{kRSUnicode=∅} = \P{kTotalStrokes=∅} - -# \P{kHanyuPinyin=∅} ⊇ \P{kMandarin=∅} +# TODO(eggrobin): Should those two have a kMandarin, or is this not actually an invariant? +# TODO(macchiati): The kHanyuPinyin UnicodeSet is excruciatingly slow. +# \p{kHanyuPinyin=/./} - \p{kMandarin=/./} = [\x{228F5}\x{2574C}] # InPC-InSC-gc invariants # See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf.