diff --git a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt index ff4eeb225..702946730 100644 --- a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt +++ b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt @@ -1,5 +1,5 @@ # DerivedCoreProperties-16.0.0.txt -# Date: 2024-01-23, 01:50:59 GMT +# Date: 2024-01-26, 18:15:15 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -10746,7 +10746,9 @@ E01F0..E0FFF ; Default_Ignorable_Code_Point # Cn [3600] ........ 109D ; Extend # Mn MYANMAR VOWEL SIGN AITON AI 135D..135F ; Extend # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK 1712..1714 ; Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA +1715 ; Extend # Mc TAGALOG SIGN PAMUDPOD 1732..1733 ; Extend # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U +1734 ; Extend # Mc HANUNOO SIGN PAMUDPOD 1752..1753 ; Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U 1772..1773 ; Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U 17B4..17B5 ; Extend # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA @@ -245,15 +247,18 @@ E01F0..E0FFF ; Control # Cn [3600] .. 
1B36..1B3A ; Extend # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA 1B3C ; Extend # Mn BALINESE VOWEL SIGN LA LENGA 1B42 ; Extend # Mn BALINESE VOWEL SIGN PEPET +1B44 ; Extend # Mc BALINESE ADEG ADEG 1B6B..1B73 ; Extend # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 1B80..1B81 ; Extend # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR 1BA2..1BA5 ; Extend # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA8..1BA9 ; Extend # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG +1BAA ; Extend # Mc SUNDANESE SIGN PAMAAEH 1BAB..1BAD ; Extend # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE6 ; Extend # Mn BATAK SIGN TOMPI 1BE8..1BE9 ; Extend # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BED ; Extend # Mn BATAK VOWEL SIGN KARO O 1BEF..1BF1 ; Extend # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H +1BF2..1BF3 ; Extend # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1C2C..1C33 ; Extend # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C36..1C37 ; Extend # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA 1CD0..1CD2 ; Extend # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA @@ -290,10 +295,12 @@ A8E0..A8F1 ; Extend # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEV A8FF ; Extend # Mn DEVANAGARI VOWEL SIGN AY A926..A92D ; Extend # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU A947..A951 ; Extend # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R +A953 ; Extend # Mc REJANG VIRAMA A980..A982 ; Extend # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR A9B3 ; Extend # Mn JAVANESE SIGN CECAK TELU A9B6..A9B9 ; Extend # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT A9BC..A9BD ; Extend # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET +A9C0 ; Extend # Mc JAVANESE PANGKON A9E5 ; Extend # Mn MYANMAR SIGN SHAN SAW AA29..AA2E ; Extend # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE 
AA31..AA32 ; Extend # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE @@ -344,10 +351,12 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 11173 ; Extend # Mn MAHAJANI SIGN NUKTA 11180..11181 ; Extend # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA 111B6..111BE ; Extend # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O +111C0 ; Extend # Mc SHARADA SIGN VIRAMA 111C9..111CC ; Extend # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK 111CF ; Extend # Mn SHARADA SIGN INVERTED CANDRABINDU 1122F..11231 ; Extend # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI 11234 ; Extend # Mn KHOJKI SIGN ANUSVARA +11235 ; Extend # Mc KHOJKI SIGN VIRAMA 11236..11237 ; Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA 1123E ; Extend # Mn KHOJKI SIGN SUKUN 11241 ; Extend # Mn KHOJKI VOWEL SIGN VOCALIC R @@ -357,11 +366,17 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 1133B..1133C ; Extend # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA 1133E ; Extend # Mc GRANTHA VOWEL SIGN AA 11340 ; Extend # Mn GRANTHA VOWEL SIGN II +1134D ; Extend # Mc GRANTHA SIGN VIRAMA 11357 ; Extend # Mc GRANTHA AU LENGTH MARK 11366..1136C ; Extend # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX 11370..11374 ; Extend # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA +113B8 ; Extend # Mc TULU-TIGALARI VOWEL SIGN AA 113BB..113C0 ; Extend # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL +113C2 ; Extend # Mc TULU-TIGALARI VOWEL SIGN EE +113C5 ; Extend # Mc TULU-TIGALARI VOWEL SIGN AI +113C7..113C9 ; Extend # Mc [3] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI AU LENGTH MARK 113CE ; Extend # Mn TULU-TIGALARI SIGN VIRAMA +113CF ; Extend # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 113D0 ; Extend # Mn TULU-TIGALARI CONJOINER 113D2 ; Extend # Mn TULU-TIGALARI GEMINATION MARK 113E1..113E2 ; Extend # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA @@ -386,6 
+401,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 116AB ; Extend # Mn TAKRI SIGN ANUSVARA 116AD ; Extend # Mn TAKRI VOWEL SIGN AA 116B0..116B5 ; Extend # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU +116B6 ; Extend # Mc TAKRI SIGN VIRAMA 116B7 ; Extend # Mn TAKRI SIGN NUKTA 1171D ; Extend # Mn AHOM CONSONANT SIGN MEDIAL LA 1171F ; Extend # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA @@ -395,6 +411,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 11839..1183A ; Extend # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA 11930 ; Extend # Mc DIVES AKURU VOWEL SIGN AA 1193B..1193C ; Extend # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU +1193D ; Extend # Mc DIVES AKURU SIGN HALANTA 1193E ; Extend # Mn DIVES AKURU VIRAMA 11943 ; Extend # Mn DIVES AKURU SIGN NUKTA 119D4..119D7 ; Extend # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR @@ -427,6 +444,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 11F00..11F01 ; Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA 11F36..11F3A ; Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R 11F40 ; Extend # Mn KAWI VOWEL SIGN EU +11F41 ; Extend # Mc KAWI SIGN KILLER 11F42 ; Extend # Mn KAWI CONJOINER 11F5A ; Extend # Mn KAWI SIGN NUKTA 13440 ; Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY @@ -438,12 +456,13 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 16F4F ; Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F8F..16F92 ; Extend # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16FE4 ; Extend # Mn KHITAN SMALL SCRIPT FILLER +16FF0..16FF1 ; Extend # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY 1BC9D..1BC9E ; Extend # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK 1CF00..1CF2D ; Extend # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT 1CF30..1CF46 ; Extend # Mn 
[23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG -1D165 ; Extend # Mc MUSICAL SYMBOL COMBINING STEM +1D165..1D166 ; Extend # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM 1D167..1D169 ; Extend # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 -1D16E..1D172 ; Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5 +1D16D..1D172 ; Extend # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 1D17B..1D182 ; Extend # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Extend # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Extend # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO @@ -471,7 +490,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 2165 +# Total code points: 2190 # ================================================ @@ -529,8 +548,6 @@ E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 103B..103C ; SpacingMark # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA 1056..1057 ; SpacingMark # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR 1084 ; SpacingMark # Mc MYANMAR VOWEL SIGN SHAN E -1715 ; SpacingMark # Mc TAGALOG SIGN PAMUDPOD -1734 ; SpacingMark # Mc HANUNOO SIGN PAMUDPOD 17B6 ; SpacingMark # Mc KHMER VOWEL SIGN AA 17BE..17C5 ; SpacingMark # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU 17C7..17C8 ; SpacingMark # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU @@ -545,15 +562,13 @@ E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 1B04 ; SpacingMark # Mc BALINESE SIGN BISAH 1B3B ; SpacingMark # Mc BALINESE VOWEL 
SIGN RA REPA TEDUNG 1B3D..1B41 ; SpacingMark # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG -1B43..1B44 ; SpacingMark # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG +1B43 ; SpacingMark # Mc BALINESE VOWEL SIGN PEPET TEDUNG 1B82 ; SpacingMark # Mc SUNDANESE SIGN PANGWISAD 1BA1 ; SpacingMark # Mc SUNDANESE CONSONANT SIGN PAMINGKAL 1BA6..1BA7 ; SpacingMark # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG -1BAA ; SpacingMark # Mc SUNDANESE SIGN PAMAAEH 1BE7 ; SpacingMark # Mc BATAK VOWEL SIGN E 1BEA..1BEC ; SpacingMark # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O 1BEE ; SpacingMark # Mc BATAK VOWEL SIGN U -1BF2..1BF3 ; SpacingMark # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN 1C24..1C2B ; SpacingMark # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU 1C34..1C35 ; SpacingMark # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1CE1 ; SpacingMark # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA @@ -562,11 +577,11 @@ A823..A824 ; SpacingMark # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI V A827 ; SpacingMark # Mc SYLOTI NAGRI VOWEL SIGN OO A880..A881 ; SpacingMark # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA A8B4..A8C3 ; SpacingMark # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU -A952..A953 ; SpacingMark # Mc [2] REJANG CONSONANT SIGN H..REJANG VIRAMA +A952 ; SpacingMark # Mc REJANG CONSONANT SIGN H A983 ; SpacingMark # Mc JAVANESE SIGN WIGNYAN A9B4..A9B5 ; SpacingMark # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG A9BA..A9BB ; SpacingMark # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE -A9BE..A9C0 ; SpacingMark # Mc [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON +A9BE..A9BF ; SpacingMark # Mc [2] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE CONSONANT SIGN CAKRA AA2F..AA30 ; SpacingMark # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI AA33..AA34 ; SpacingMark # Mc [2] CHAM CONSONANT SIGN YA..CHAM 
CONSONANT SIGN RA AA4D ; SpacingMark # Mc CHAM CONSONANT SIGN FINAL H @@ -586,24 +601,20 @@ ABEC ; SpacingMark # Mc MEETEI MAYEK LUM IYEK 11145..11146 ; SpacingMark # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI 11182 ; SpacingMark # Mc SHARADA SIGN VISARGA 111B3..111B5 ; SpacingMark # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II -111BF..111C0 ; SpacingMark # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA +111BF ; SpacingMark # Mc SHARADA VOWEL SIGN AU 111CE ; SpacingMark # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E 1122C..1122E ; SpacingMark # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II 11232..11233 ; SpacingMark # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU -11235 ; SpacingMark # Mc KHOJKI SIGN VIRAMA 112E0..112E2 ; SpacingMark # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II 11302..11303 ; SpacingMark # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA 1133F ; SpacingMark # Mc GRANTHA VOWEL SIGN I 11341..11344 ; SpacingMark # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR 11347..11348 ; SpacingMark # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI -1134B..1134D ; SpacingMark # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA +1134B..1134C ; SpacingMark # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU 11362..11363 ; SpacingMark # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL -113B8..113BA ; SpacingMark # Mc [3] TULU-TIGALARI VOWEL SIGN AA..TULU-TIGALARI VOWEL SIGN II -113C2 ; SpacingMark # Mc TULU-TIGALARI VOWEL SIGN EE -113C5 ; SpacingMark # Mc TULU-TIGALARI VOWEL SIGN AI -113C7..113CA ; SpacingMark # Mc [4] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI SIGN CANDRA ANUNASIKA +113B9..113BA ; SpacingMark # Mc [2] TULU-TIGALARI VOWEL SIGN I..TULU-TIGALARI VOWEL SIGN II +113CA ; SpacingMark # Mc TULU-TIGALARI SIGN CANDRA ANUNASIKA 113CC..113CD ; SpacingMark # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA -113CF ; SpacingMark # Mc TULU-TIGALARI SIGN LOOPED VIRAMA 11435..11437 ; 
SpacingMark # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II 11440..11441 ; SpacingMark # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU 11445 ; SpacingMark # Mc NEWA SIGN VISARGA @@ -620,14 +631,12 @@ ABEC ; SpacingMark # Mc MEETEI MAYEK LUM IYEK 1163E ; SpacingMark # Mc MODI SIGN VISARGA 116AC ; SpacingMark # Mc TAKRI SIGN VISARGA 116AE..116AF ; SpacingMark # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II -116B6 ; SpacingMark # Mc TAKRI SIGN VIRAMA 1171E ; SpacingMark # Mc AHOM CONSONANT SIGN MEDIAL RA 11726 ; SpacingMark # Mc AHOM VOWEL SIGN E 1182C..1182E ; SpacingMark # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II 11838 ; SpacingMark # Mc DOGRA SIGN VISARGA 11931..11935 ; SpacingMark # Mc [5] DIVES AKURU VOWEL SIGN I..DIVES AKURU VOWEL SIGN E 11937..11938 ; SpacingMark # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O -1193D ; SpacingMark # Mc DIVES AKURU SIGN HALANTA 11940 ; SpacingMark # Mc DIVES AKURU MEDIAL YA 11942 ; SpacingMark # Mc DIVES AKURU MEDIAL RA 119D1..119D3 ; SpacingMark # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II @@ -648,14 +657,10 @@ ABEC ; SpacingMark # Mc MEETEI MAYEK LUM IYEK 11F03 ; SpacingMark # Mc KAWI SIGN VISARGA 11F34..11F35 ; SpacingMark # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA 11F3E..11F3F ; SpacingMark # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI -11F41 ; SpacingMark # Mc KAWI SIGN KILLER 1612A..1612C ; SpacingMark # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA 16F51..16F87 ; SpacingMark # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI -16FF0..16FF1 ; SpacingMark # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -1D166 ; SpacingMark # Mc MUSICAL SYMBOL COMBINING SPRECHGESANG STEM -1D16D ; SpacingMark # Mc MUSICAL SYMBOL COMBINING AUGMENTATION DOT -# Total code points: 411 +# Total code points: 386 # ================================================ diff --git 
a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 76ae34cb6..095683a25 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -757,6 +757,40 @@ Let $PostBaseSpacingMarks_Tweak = [\u103B \u1056 \u1057 \u1A57 \u1A6D] Let $PostBaseSpacingMarks_Missed = [] [$PostBaseSpacingMarks_All - $PostBaseSpacingMarks_Tweak - $PostBaseSpacingMarks_Missed] ⊂ [:GCB=XX:] +# Check the consistency of grapheme cluster segmentation (both legacy and +# extended) with canonical equivalence. +# Non-starters are GCB=Extend or GCB=SpacingMark, so that GB9 and GB9a keep +# together any sequences that may be reordered by the Canonical Ordering +# Algorithm. This has been true ever since Extended Grapheme Clusters were +# added. +\P{U5.1.0:ccc=0} ⊆ [\p{U5.1.0:GCB=Extend}\p{U5.1.0:GCB=SpacingMark}] +\P{ccc=0} ⊆ [\p{GCB=Extend}\p{GCB=SpacingMark}] +# Non-starters are actually GCB=Extend, so that GB9 alone does the job, since +# there is no GB9a in legacy grapheme clusters. +# But not before Unicode Version 16.0, even though we were saying so since +# Unicode Version 4.0 (https://www.unicode.org/reports/tr29/tr29-4.html#Implementation_Notes), +# oops (see L2/24-009). 
+Let $TwoForgottenMusicalSymbols = \p{Name=/^MUSICAL SYMBOL COMBINING (SPRECHGESANG STEM|AUGMENTATION DOT)$/} +Let $FourteenSpacingViramas = [\p{U15.1.0:ccc=9}&\p{U15.1.0:gc=Mc}] +Let $TwoVietnameseReadingMarks = [\p{U15.1.0:ccc=6}] +[\P{U4.0.0:ccc=0} - \p{U4.0.0:Grapheme_Extend}] = [$TwoForgottenMusicalSymbols \p{Name=/^MUSICAL SYMBOL COMBINING FLAG-[3-5]$/}] +[\P{U4.1.0:ccc=0} - \p{U4.1.0:GCB=Extend}] = $TwoForgottenMusicalSymbols +[\P{U15.1.0:ccc=0} - \p{U15.1.0:GCB=Extend}] = [$TwoForgottenMusicalSymbols $FourteenSpacingViramas $TwoVietnameseReadingMarks] + \P{ ccc=0} ⊆ \p{ GCB=Extend} + +# Characters that appear in non-initial position in the canonical decomposition +# of another character are either Extend, V, or T, so that sequences that are +# equivalent to a canonical composite are kept together by GB6..GB9. +# We only look at the starters, since we dealt with non-starters above. +# Characters that appear in non-initial position in the canonical decomposition +# of a primary composite are NFC_QC=Maybe. We would need to separately check +# the characters that appear in non-initial position in the canonical +# decomposition of a full composition exclusion. +# We would also need to separately check that the characters that are T or V only +# appear in canonical decompositions where they follow an LV, LVT, V, or T, or +# an LV or V, respectively. +[\p{NFC_QC=Maybe}&\p{ccc=0}] ⊆ [\p{GCB=Extend}\p{GCB=T}\p{GCB=V}] + ########################## # Emoji ##########################