diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
index 9fa0ca3a0..b8f79fbe1 100644
--- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
+++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -746,6 +746,37 @@ Let $PostBaseSpacingMarks_Tweak = [\u103B \u1056 \u1057 \u1A57 \u1A6D]
 Let $PostBaseSpacingMarks_Missed = []
 [$PostBaseSpacingMarks_All - $PostBaseSpacingMarks_Tweak - $PostBaseSpacingMarks_Missed] ⊂ [:GCB=XX:]
 
+# Check the consistency of grapheme cluster segmentation (both legacy and
+# extended) with canonical equivalence.
+# Non-starters are GCB=Extend or GCB=SpacingMark, so that GB9 and GB9a keep
+# together any sequences that may be reordered by the Canonical Ordering
+# Algorithm. This has been true ever since Extended Grapheme Clusters were
+# added.
+\P{U5.1.0:ccc=0} ⊆ [\p{U5.1.0:GCB=Extend}\p{U5.1.0:GCB=SpacingMark}]
+\P{ccc=0} ⊆ [\p{GCB=Extend}\p{GCB=SpacingMark}]
+# Non-starters are actually GCB=Extend, so that GB9 alone does the job, since
+# there is no GB9a in legacy grapheme clusters.
+# But not before Unicode Version 16.0, even though we were saying so since
+# Unicode Version 4.0 (https://www.unicode.org/reports/tr29/tr29-4.html#Implementation_Notes),
+# oops (see L2/24-009); hence ⊈ for the versions before 16.0.
+\P{U4.0.0:ccc=0} ⊈ \p{U4.0.0:Grapheme_Extend}
+\P{U4.1.0:ccc=0} ⊈ \p{U4.1.0:GCB=Extend}
+\P{U15.1.0:ccc=0} ⊈ \p{U15.1.0:GCB=Extend}
+\P{ccc=0} ⊆ \p{GCB=Extend}
+
+# Characters that appear in non-initial position in the canonical decomposition
+# of another character are either Extend, V, or T, so that sequences that are
+# equivalent to a canonical composite are kept together by GB6..GB9.
+# We only look at the starters, since we dealt with non-starters above.
+# Characters that appear in non-initial position in the canonical decomposition
+# of a primary composite are NFC_QC=Maybe.
We would need to separately check
+# the characters that appear in non-initial position in the canonical
+# decomposition of a full composition exclusion.
+# We would also need to separately check that the characters that are T or V
+# only appear in canonical decompositions where they follow an LV, LVT, V, or
+# T, or an L, LV, or V, respectively.
+[\p{NFC_QC=Maybe}&\p{ccc=0}] ⊆ [\p{GCB=Extend}\p{GCB=T}\p{GCB=V}]
+
 ##########################
 # Emoji
 ##########################