From bc18fb066adf3da535d0664c1d68a6820729e5f5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 19 Jan 2024 16:54:56 +0100 Subject: [PATCH 01/10] UnicodeData.txt lines reconstructed from L2/23-284 --- unicodetools/data/ucd/dev/UnicodeData.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unicodetools/data/ucd/dev/UnicodeData.txt b/unicodetools/data/ucd/dev/UnicodeData.txt index 4376ff723..714fe20e5 100644 --- a/unicodetools/data/ucd/dev/UnicodeData.txt +++ b/unicodetools/data/ucd/dev/UnicodeData.txt @@ -1,3 +1,5 @@ +16FF2;CHINESE SMALL SIMPLIFIED ER;Lo;0;L;;;;;N;;;;; +16FF3;CHINESE SMALL TRADITIONAL ER;Lo;0;L;;;;;N;;;;; 0000;;Cc;0;BN;;;;;N;NULL;;;; 0001;;Cc;0;BN;;;;;N;START OF HEADING;;;; 0002;;Cc;0;BN;;;;;N;START OF TEXT;;;; From 8e3085fa37e812a4f4c7e8503c577065b309eab4 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 19 Jan 2024 16:59:22 +0100 Subject: [PATCH 02/10] LineBreak.txt as described by L2/23-284 --- unicodetools/data/ucd/dev/LineBreak.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unicodetools/data/ucd/dev/LineBreak.txt b/unicodetools/data/ucd/dev/LineBreak.txt index 7b47514bc..b23d916e7 100644 --- a/unicodetools/data/ucd/dev/LineBreak.txt +++ b/unicodetools/data/ucd/dev/LineBreak.txt @@ -1,3 +1,5 @@ +16FF2;NS +16FF3;NS # LineBreak-16.0.0.txt # Date: 2023-11-10, 22:25:50 GMT # © 2023 Unicode®, Inc. From 252c3bfda3987cb1bb780b5773bc64bbc0c1695c Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 19 Jan 2024 17:00:25 +0100 Subject: [PATCH 03/10] script --- unicodetools/data/ucd/dev/Scripts.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/unicodetools/data/ucd/dev/Scripts.txt b/unicodetools/data/ucd/dev/Scripts.txt index 73b0ec242..ee80b59cd 100644 --- a/unicodetools/data/ucd/dev/Scripts.txt +++ b/unicodetools/data/ucd/dev/Scripts.txt @@ -1,3 +1,4 @@ +16FF2..16FF3; Han # Scripts-16.0.0.txt # Date: 2023-11-10, 22:26:58 GMT # © 2023 Unicode®, Inc. From 016c6eb999221c9099170217f3fe24689f82b313 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 19 Jan 2024 17:02:32 +0100 Subject: [PATCH 04/10] Ideographic Extender --- unicodetools/data/ucd/dev/PropList.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unicodetools/data/ucd/dev/PropList.txt b/unicodetools/data/ucd/dev/PropList.txt index 6225e4c33..51e1ac3d1 100644 --- a/unicodetools/data/ucd/dev/PropList.txt +++ b/unicodetools/data/ucd/dev/PropList.txt @@ -1,3 +1,5 @@ +16FF2..16FF3; Ideographic +16FF2..16FF3; Extender # PropList-16.0.0.txt # Date: 2023-11-10, 22:06:29 GMT # © 2023 Unicode®, Inc. From 73a387763e7bd72c69f385e5622ba63443fcbbc9 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 19 Jan 2024 17:07:40 +0100 Subject: [PATCH 05/10] Regenerate UCD --- unicodetools/data/ucd/dev/DerivedAge.txt | 5 +++-- .../data/ucd/dev/DerivedCoreProperties.txt | 20 ++++++++++++------- unicodetools/data/ucd/dev/EastAsianWidth.txt | 3 ++- unicodetools/data/ucd/dev/LineBreak.txt | 5 ++--- unicodetools/data/ucd/dev/PropList.txt | 10 +++++----- unicodetools/data/ucd/dev/Scripts.txt | 6 +++--- unicodetools/data/ucd/dev/UnicodeData.txt | 4 ++-- .../data/ucd/dev/VerticalOrientation.txt | 5 +++-- .../dev/auxiliary/SentenceBreakProperty.txt | 5 +++-- .../ucd/dev/extracted/DerivedBidiClass.txt | 5 +++-- .../dev/extracted/DerivedCombiningClass.txt | 5 +++-- .../dev/extracted/DerivedEastAsianWidth.txt | 5 +++-- .../dev/extracted/DerivedGeneralCategory.txt | 9 +++++---- .../ucd/dev/extracted/DerivedLineBreak.txt | 9 +++++---- .../data/ucd/dev/extracted/DerivedName.txt | 6 ++++-- 15 files changed, 59 insertions(+), 43 deletions(-) diff --git a/unicodetools/data/ucd/dev/DerivedAge.txt b/unicodetools/data/ucd/dev/DerivedAge.txt index d0f51612e..486add8ef 100644 --- a/unicodetools/data/ucd/dev/DerivedAge.txt +++ b/unicodetools/data/ucd/dev/DerivedAge.txt @@ -1,5 +1,5 @@ # DerivedAge-16.0.0.txt -# Date: 2023-11-10, 22:24:48 GMT +# Date: 2024-01-19, 16:06:19 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -2042,6 +2042,7 @@ A7DA..A7DC ; 16.0 # [3] LATIN CAPITAL LETTER LAMBDA..LATIN CAPITAL LETTER L 11F5A ; 16.0 # KAWI SIGN NUKTA 16100..16139 ; 16.0 # [58] GURUNG KHEMA LETTER A..GURUNG KHEMA DIGIT NINE 16D40..16D79 ; 16.0 # [58] KIRAT RAI SIGN ANUSVARA..KIRAT RAI DIGIT NINE +16FF2..16FF3 ; 16.0 # [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 18CFF ; 16.0 # KHITAN SMALL SCRIPT CHARACTER-18CFF 1CC00..1CCF9 ; 16.0 # [250] UP-POINTING GO-KART..OUTLINED DIGIT NINE 1CD00..1CEB3 ; 16.0 # [436] BLOCK OCTANT-3..BLACK RIGHT TRIANGLE CARET @@ -2058,6 +2059,6 @@ A7DA..A7DC ; 16.0 # [3] LATIN CAPITAL LETTER LAMBDA..LATIN CAPITAL LETTER L 1FAE9 ; 16.0 # FACE WITH BAGS UNDER EYES 1FBCB..1FBEF ; 16.0 # [37] WHITE CROSS MARK..TOP LEFT JUSTIFIED LOWER RIGHT QUARTER BLACK CIRCLE -# Total code points: 1192 +# Total code points: 1194 # EOF diff --git a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt index 118350283..8c2a7ff37 100644 --- a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt +++ b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt @@ -1,5 +1,5 @@ # DerivedCoreProperties-16.0.0.txt -# Date: 2023-11-10, 22:25:26 GMT +# Date: 2024-01-19, 16:06:46 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -1319,6 +1319,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 16FE0..16FE1 ; Alphabetic # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Alphabetic # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; Alphabetic # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; Alphabetic # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; Alphabetic # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; Alphabetic # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; Alphabetic # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -1440,7 +1441,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 30000..3134A ; Alphabetic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Alphabetic # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 138766 +# Total code points: 138768 # ================================================ @@ -6850,6 +6851,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 16F93..16F9F ; ID_Start # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; ID_Start # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; ID_Start # Lm OLD CHINESE ITERATION MARK +16FF2..16FF3 ; ID_Start # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; ID_Start # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; ID_Start # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; ID_Start # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -6960,7 +6962,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 30000..3134A ; ID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; ID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 137276 +# Total code points: 137278 # ================================================ @@ -8219,6 +8221,7 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN 16FE3 ; ID_Continue # Lm OLD CHINESE ITERATION MARK 16FE4 ; ID_Continue # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; ID_Continue # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; ID_Continue # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; ID_Continue # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; ID_Continue # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; ID_Continue # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -8367,7 +8370,7 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN 31350..323AF ; ID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF E0100..E01EF ; ID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 140548 +# Total code points: 140550 # ================================================ @@ -9034,6 +9037,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU 16F93..16F9F ; XID_Start # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; XID_Start # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; XID_Start # Lm OLD CHINESE ITERATION MARK +16FF2..16FF3 ; XID_Start # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; XID_Start # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; XID_Start # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; XID_Start # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -9144,7 +9148,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU 30000..3134A ; XID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; XID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 137253 +# Total code points: 137255 # ================================================ @@ -10404,6 +10408,7 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA 16FE3 ; XID_Continue # Lm OLD CHINESE ITERATION MARK 16FE4 ; XID_Continue # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; XID_Continue # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; XID_Continue # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; XID_Continue # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; XID_Continue # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; XID_Continue # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -10552,7 +10557,7 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA 31350..323AF ; XID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF E0100..E01EF ; XID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 140529 +# Total code points: 140531 # ================================================ @@ -12593,6 +12598,7 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME 16FE2 ; Grapheme_Base # Po OLD CHINESE HOOK MARK 16FE3 ; Grapheme_Base # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; Grapheme_Base # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; Grapheme_Base # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; Grapheme_Base # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; Grapheme_Base # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; Grapheme_Base # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -12800,7 +12806,7 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME 30000..3134A ; Grapheme_Base # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Grapheme_Base # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 148770 +# Total code points: 148772 # ================================================ diff --git a/unicodetools/data/ucd/dev/EastAsianWidth.txt b/unicodetools/data/ucd/dev/EastAsianWidth.txt index a2faf7b49..806e31e7e 100644 --- a/unicodetools/data/ucd/dev/EastAsianWidth.txt +++ b/unicodetools/data/ucd/dev/EastAsianWidth.txt @@ -1,5 +1,5 @@ # EastAsianWidth-16.0.0.txt -# Date: 2023-11-10, 22:25:47 GMT +# Date: 2024-01-19, 16:06:53 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -2355,6 +2355,7 @@ FFFD ; A # So REPLACEMENT CHARACTER 16FE3 ; W # Lm OLD CHINESE ITERATION MARK 16FE4 ; W # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; W # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; N # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; W # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18AFF ; W # Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 18B00..18CD5 ; W # Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5 diff --git a/unicodetools/data/ucd/dev/LineBreak.txt b/unicodetools/data/ucd/dev/LineBreak.txt index b23d916e7..93073d5e8 100644 --- a/unicodetools/data/ucd/dev/LineBreak.txt +++ b/unicodetools/data/ucd/dev/LineBreak.txt @@ -1,7 +1,5 @@ -16FF2;NS -16FF3;NS # LineBreak-16.0.0.txt -# Date: 2023-11-10, 22:25:50 GMT +# Date: 2024-01-19, 16:06:54 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -3264,6 +3262,7 @@ FFFD ; AI # So REPLACEMENT CHARACTER 16FE3 ; NS # Lm OLD CHINESE ITERATION MARK 16FE4 ; GL # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; CM # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; NS # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; ID # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18AFF ; ID # Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 18B00..18CD5 ; AL # Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5 diff --git a/unicodetools/data/ucd/dev/PropList.txt b/unicodetools/data/ucd/dev/PropList.txt index 51e1ac3d1..8c701f0c9 100644 --- a/unicodetools/data/ucd/dev/PropList.txt +++ b/unicodetools/data/ucd/dev/PropList.txt @@ -1,7 +1,5 @@ -16FF2..16FF3; Ideographic -16FF2..16FF3; Extender # PropList-16.0.0.txt -# Date: 2023-11-10, 22:06:29 GMT +# Date: 2024-01-19, 16:06:58 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -865,6 +863,7 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA F900..FA6D ; Ideographic # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 16FE4 ; Ideographic # Mn KHITAN SMALL SCRIPT FILLER +16FF2..16FF3 ; Ideographic # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; Ideographic # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; Ideographic # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18D00..18D08 ; Ideographic # Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08 @@ -879,7 +878,7 @@ FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COM 30000..3134A ; Ideographic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Ideographic # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 106476 +# Total code points: 106478 # ================================================ @@ -1171,11 +1170,12 @@ FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND 16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM 16FE0..16FE1 ; Extender # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Extender # Lm OLD CHINESE ITERATION MARK +16FF2..16FF3 ; Extender # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1E13C..1E13D ; Extender # Lm [2] NYIAKENG PUACHUE HMONG SIGN XW XW..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E5EF ; Extender # Mn OL ONAL SIGN IKIR 1E944..1E946 ; Extender # Mn [3] ADLAM ALIF LENGTHENER..ADLAM GEMINATION MARK -# Total code points: 56 +# Total code points: 58 # ================================================ diff --git a/unicodetools/data/ucd/dev/Scripts.txt b/unicodetools/data/ucd/dev/Scripts.txt index ee80b59cd..fb543df9b 100644 --- a/unicodetools/data/ucd/dev/Scripts.txt +++ b/unicodetools/data/ucd/dev/Scripts.txt @@ -1,6 +1,5 @@ -16FF2..16FF3; Han # Scripts-16.0.0.txt -# Date: 2023-11-10, 22:26:58 GMT +# Date: 2024-01-19, 16:07:17 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -1595,6 +1594,7 @@ FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILI 16FE2 ; Han # Po OLD CHINESE HOOK MARK 16FE3 ; Han # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; Han # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; Han # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 20000..2A6DF ; Han # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B739 ; Han # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D @@ -1605,7 +1605,7 @@ FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILI 30000..3134A ; Han # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Han # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 99030 +# Total code points: 99032 # ================================================ diff --git a/unicodetools/data/ucd/dev/UnicodeData.txt b/unicodetools/data/ucd/dev/UnicodeData.txt index 714fe20e5..7a703e1cb 100644 --- a/unicodetools/data/ucd/dev/UnicodeData.txt +++ b/unicodetools/data/ucd/dev/UnicodeData.txt @@ -1,5 +1,3 @@ -16FF2;CHINESE SMALL SIMPLIFIED ER;Lo;0;L;;;;;N;;;;; -16FF3;CHINESE SMALL TRADITIONAL ER;Lo;0;L;;;;;N;;;;; 0000;;Cc;0;BN;;;;;N;NULL;;;; 0001;;Cc;0;BN;;;;;N;START OF HEADING;;;; 0002;;Cc;0;BN;;;;;N;START OF TEXT;;;; @@ -26289,6 +26287,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 16FE4;KHITAN SMALL SCRIPT FILLER;Mn;0;NSM;;;;;N;;;;; 16FF0;VIETNAMESE ALTERNATE READING MARK CA;Mc;6;L;;;;;N;;;;; 16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;; +16FF2;CHINESE SMALL SIMPLIFIED ER;Lo;0;L;;;;;N;;;;; +16FF3;CHINESE SMALL TRADITIONAL ER;Lo;0;L;;;;;N;;;;; 17000;;Lo;0;L;;;;;N;;;;; 187F7;;Lo;0;L;;;;;N;;;;; 18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;; diff --git a/unicodetools/data/ucd/dev/VerticalOrientation.txt b/unicodetools/data/ucd/dev/VerticalOrientation.txt index 005f57218..39a3a0850 100644 --- a/unicodetools/data/ucd/dev/VerticalOrientation.txt +++ b/unicodetools/data/ucd/dev/VerticalOrientation.txt @@ -1,5 +1,5 @@ # VerticalOrientation-16.0.0.txt -# Date: 2023-11-10, 22:27:08 GMT +# Date: 2024-01-19, 16:07:20 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -2186,7 +2186,8 @@ FFFC..FFFD ; U # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARA 16FE4 ; U # Mn KHITAN SMALL SCRIPT FILLER 16FE5..16FEF ; U # Cn [11] .. 16FF0..16FF1 ; U # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FFF ; U # Cn [14] .. +16FF2..16FF3 ; U # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF4..16FFF ; U # Cn [12] .. 17000..187F7 ; U # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 187F8..187FF ; U # Cn [8] .. 18800..18AFF ; U # Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt index 9138d7847..9c906fe43 100644 --- a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt +++ b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt @@ -1,5 +1,5 @@ # SentenceBreakProperty-16.0.0.txt -# Date: 2023-11-10, 22:27:00 GMT +# Date: 2024-01-19, 16:07:18 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -2509,6 +2509,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 16F93..16F9F ; OLetter # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; OLetter # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; OLetter # Lm OLD CHINESE ITERATION MARK +16FF2..16FF3 ; OLetter # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; OLetter # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; OLetter # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; OLetter # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -2584,7 +2585,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 30000..3134A ; OLetter # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; OLetter # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 132915 +# Total code points: 132917 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt index 03c8c7701..365296759 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt @@ -1,5 +1,5 @@ # DerivedBidiClass-16.0.0.txt -# Date: 2023-11-10, 22:25:22 GMT +# Date: 2024-01-19, 16:06:44 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -1102,6 +1102,7 @@ FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER 16FE0..16FE1 ; L # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; L # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; L # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; L # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; L # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; L # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; L # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -1218,7 +1219,7 @@ FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER F0000..FFFFD ; L # Co [65534] .. 100000..10FFFD; L # Co [65534] .. -# The above property value applies to 819344 code points not listed here. +# The above property value applies to 819342 code points not listed here. # Total code points: 1095518 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt index b80bb140e..9339b0392 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt @@ -1,5 +1,5 @@ # DerivedCombiningClass-16.0.0.txt -# Date: 2023-11-10, 22:25:25 GMT +# Date: 2024-01-19, 16:06:46 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -1838,6 +1838,7 @@ FFFC..FFFD ; 0 # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER 16FE2 ; 0 # Po OLD CHINESE HOOK MARK 16FE3 ; 0 # Lm OLD CHINESE ITERATION MARK 16FE4 ; 0 # Mn KHITAN SMALL SCRIPT FILLER +16FF2..16FF3 ; 0 # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; 0 # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; 0 # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; 0 # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -2059,7 +2060,7 @@ E0100..E01EF ; 0 # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 F0000..FFFFD ; 0 # Co [65534] .. 100000..10FFFD; 0 # Co [65534] .. -# The above property value applies to 825574 code points not listed here. +# The above property value applies to 825572 code points not listed here. # Total code points: 1113178 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt index d06e0e6e8..7b928a72e 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt @@ -1,5 +1,5 @@ # DerivedEastAsianWidth-16.0.0.txt -# Date: 2023-11-10, 22:25:29 GMT +# Date: 2024-01-19, 16:06:48 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -1872,6 +1872,7 @@ FFFC ; N # So OBJECT REPLACEMENT CHARACTER 16F51..16F87 ; N # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F8F..16F92 ; N # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; N # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 +16FF2..16FF3 ; N # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1BC00..1BC6A ; N # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; N # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; N # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL @@ -2102,7 +2103,7 @@ FFFC ; N # So OBJECT REPLACEMENT CHARACTER E0001 ; N # Cf LANGUAGE TAG E0020..E007F ; N # Cf [96] TAG SPACE..CANCEL TAG -# The above property value applies to 765092 code points not listed here. +# The above property value applies to 765090 code points not listed here. # Total code points: 792608 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt index 44fd2035f..5a213c3aa 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt @@ -1,5 +1,5 @@ # DerivedGeneralCategory-16.0.0.txt -# Date: 2023-11-10, 22:25:30 GMT +# Date: 2024-01-19, 16:06:49 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -577,7 +577,7 @@ FFFE..FFFF ; Cn # [2] .. 16F88..16F8E ; Cn # [7] .. 16FA0..16FDF ; Cn # [64] .. 16FE5..16FEF ; Cn # [11] .. -16FF2..16FFF ; Cn # [14] .. +16FF4..16FFF ; Cn # [12] .. 187F8..187FF ; Cn # [8] .. 18CD6..18CFE ; Cn # [41] .. 18D09..1AFEF ; Cn # [8935] .. @@ -746,7 +746,7 @@ E01F0..EFFFF ; Cn # [65040] .. FFFFE..FFFFF ; Cn # [2] .. 10FFFE..10FFFF; Cn # [2] .. -# Total code points: 823526 +# Total code points: 823524 # ================================================ @@ -2637,6 +2637,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 16D43..16D6A ; Lo # [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16F00..16F4A ; Lo # [75] MIAO LETTER PA..MIAO LETTER RTE 16F50 ; Lo # MIAO LETTER NASALIZATION +16FF2..16FF3 ; Lo # [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; Lo # [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; Lo # [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; Lo # [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -2706,7 +2707,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 30000..3134A ; Lo # [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Lo # [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 132484 +# Total code points: 132486 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt index 779b3bbe8..28d154ff2 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt @@ -1,5 +1,5 @@ # DerivedLineBreak-16.0.0.txt -# Date: 2023-11-10, 22:25:34 GMT +# Date: 2024-01-19, 16:06:50 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -69,8 +69,8 @@ E000..F8FF ; XX # Co [6400] .. F0000..FFFFD ; XX # Co [65534] .. 100000..10FFFD; XX # Co [65534] .. -# The above property value applies to 761646 code points not listed here. -# Total code points: 899114 +# The above property value applies to 761644 code points not listed here. +# Total code points: 899112 # ================================================ @@ -352,9 +352,10 @@ FF9E..FF9F ; NS # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KA 16FE0..16FE1 ; NS # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE2 ; NS # Po OLD CHINESE HOOK MARK 16FE3 ; NS # Lm OLD CHINESE ITERATION MARK +16FF2..16FF3 ; NS # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1F679..1F67B ; NS # So [3] HEAVY INTERROBANG ORNAMENT..HEAVY SANS-SERIF INTERROBANG ORNAMENT -# Total code points: 33 +# Total code points: 35 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedName.txt b/unicodetools/data/ucd/dev/extracted/DerivedName.txt index 145e66ed9..b364f8546 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedName.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedName.txt @@ -1,5 +1,5 @@ # DerivedName-16.0.0.txt -# Date: 2023-11-10, 22:25:34 GMT +# Date: 2024-01-19, 16:06:51 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -36952,6 +36952,8 @@ FFFD ; REPLACEMENT CHARACTER 16FE4 ; KHITAN SMALL SCRIPT FILLER 16FF0 ; VIETNAMESE ALTERNATE READING MARK CA 16FF1 ; VIETNAMESE ALTERNATE READING MARK NHAY +16FF2 ; CHINESE SMALL SIMPLIFIED ER +16FF3 ; CHINESE SMALL TRADITIONAL ER 17000..187F7 ; TANGUT IDEOGRAPH-* 18800 ; TANGUT COMPONENT-001 18801 ; TANGUT COMPONENT-002 @@ -45368,6 +45370,6 @@ E01ED ; VARIATION SELECTOR-254 E01EE ; VARIATION SELECTOR-255 E01EF ; VARIATION SELECTOR-256 -# Total code points: 151005 +# Total code points: 151007 # EOF From 985c11d2e6dc3eb7e83b857d77a5c47db0084293 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 19 Jan 2024 20:51:32 +0100 Subject: [PATCH 06/10] ideohack --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 34928fd9f..c99538696 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -886,7 +886,7 @@ $punct ⊇ [[\u0021-\u007E] - [0-9 A-Z a-z]] # Ideographic invariants -Let $ideohack = [〆 〇 〡-〩] +Let $ideohack = [〆 〇 〡-〩 \p{Name=/^CHINESE SMALL (SIMPLIFIED|TRADITIONAL) ER$/}] [\p{ideographic}-\p{sc=tangut}-\p{sc=nushu}-\p{sc=Khitan_Small_Script}-\p{nfkcqc=n}-$ideohack] = \p{Unified_Ideograph} # 10.0 added Nushu From e0814bf80f257554a1c57c57afb8d45d9de4589d Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 19 Jan 2024 20:58:31 +0100 Subject: [PATCH 07/10] Remove changelog --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index c99538696..dc44db4b5 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -889,8 +889,6 @@ $punct ⊇ [[\u0021-\u007E] - [0-9 A-Z a-z]] Let $ideohack = [〆 〇 〡-〩 \p{Name=/^CHINESE SMALL (SIMPLIFIED|TRADITIONAL) ER$/}] [\p{ideographic}-\p{sc=tangut}-\p{sc=nushu}-\p{sc=Khitan_Small_Script}-\p{nfkcqc=n}-$ideohack] = \p{Unified_Ideograph} -# 10.0 added Nushu -# 13.0 added Khitan_Small_Script \p{Unified_Ideograph} ⊂ \p{kRSUnicode=/./} \p{kRSUnicode=/./} = [\p{Block=/^CJK.(Unified|Compatibility).Ideographs/} - \p{gc=Cn}] From e7992e92ceb99b5dbd97fa6fdab5a23329bccbcd Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 24 Jul 2024 18:20:21 +0200 Subject: [PATCH 08/10] gc=Lm for the small ers --- unicodetools/data/ucd/dev/UnicodeData.txt | 4 ++-- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/unicodetools/data/ucd/dev/UnicodeData.txt b/unicodetools/data/ucd/dev/UnicodeData.txt index 98b678a33..d7bc5eb37 100644 --- a/unicodetools/data/ucd/dev/UnicodeData.txt +++ b/unicodetools/data/ucd/dev/UnicodeData.txt @@ -30280,8 +30280,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 16FE4;KHITAN SMALL SCRIPT FILLER;Mn;0;NSM;;;;;N;;;;; 16FF0;VIETNAMESE ALTERNATE READING MARK CA;Mc;6;L;;;;;N;;;;; 16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;; -16FF2;CHINESE SMALL SIMPLIFIED ER;Lo;0;L;;;;;N;;;;; -16FF3;CHINESE SMALL TRADITIONAL ER;Lo;0;L;;;;;N;;;;; +16FF2;CHINESE SMALL SIMPLIFIED ER;Lm;0;L;;;;;N;;;;; +16FF3;CHINESE SMALL TRADITIONAL ER;Lm;0;L;;;;;N;;;;; 17000;;Lo;0;L;;;;;N;;;;; 187F7;;Lo;0;L;;;;;N;;;;; 18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;; diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index b2e0f9163..754b97785 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -1195,7 +1195,11 @@ $punct ⊇ [[\u0021-\u007E] - [0-9 A-Z a-z]] # The Khitan Small Script filler is a Nonspacing Mark. # The other characters are numerals (the Hangzhou ten through thirty are compatibility decomposable, # but not the one through nine) and have Script=Han. -Let $NonOtherLetterIdeographs := [\N{KHITAN SMALL SCRIPT FILLER} 〇 〡-〩 〸-〺] +Let $NonOtherLetterIdeographs := [ + \N{KHITAN SMALL SCRIPT FILLER} + 〇 〡-〩 〸-〺 + \p{NAME=/^CHINESE SMALL (SIMPLIFIED|TRADITIONAL) ER$/} +] $NonOtherLetterIdeographs = [\p{Ideographic} - \p{gc=Lo}] # Ideographic closing mark, gc=Lo. Let $CommonIdeographs := [〆] From dc19a4c33a3ffb6bd88b7677a87abf3141c1f236 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 24 Jul 2024 18:30:07 +0200 Subject: [PATCH 09/10] Regenerate UCD --- .../data/ucd/dev/DerivedCoreProperties.txt | 17 +++++++++-------- unicodetools/data/ucd/dev/EastAsianWidth.txt | 4 ++-- unicodetools/data/ucd/dev/LineBreak.txt | 4 ++-- unicodetools/data/ucd/dev/PropList.txt | 6 +++--- unicodetools/data/ucd/dev/Scripts.txt | 4 ++-- .../data/ucd/dev/VerticalOrientation.txt | 4 ++-- .../ucd/dev/auxiliary/SentenceBreakProperty.txt | 4 ++-- .../data/ucd/dev/extracted/DerivedBidiClass.txt | 4 ++-- .../ucd/dev/extracted/DerivedCombiningClass.txt | 4 ++-- .../ucd/dev/extracted/DerivedEastAsianWidth.txt | 4 ++-- .../dev/extracted/DerivedGeneralCategory.txt | 8 ++++---- .../data/ucd/dev/extracted/DerivedLineBreak.txt | 4 ++-- 12 files changed, 34 insertions(+), 33 deletions(-) diff --git a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt index 69bd990de..778ebfd02 100644 --- a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt +++ b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt @@ -1,5 +1,5 @@ # DerivedCoreProperties-16.0.0.txt -# Date: 2024-06-06, 16:48:48 GMT +# Date: 2024-07-24, 16:28:17 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -1320,7 +1320,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 16FE0..16FE1 ; Alphabetic # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Alphabetic # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; Alphabetic # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FF3 ; Alphabetic # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; Alphabetic # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; Alphabetic # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; Alphabetic # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; Alphabetic # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -3465,6 +3465,7 @@ FFF9..FFFB ; Case_Ignorable # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI 16FE0..16FE1 ; Case_Ignorable # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Case_Ignorable # Lm OLD CHINESE ITERATION MARK 16FE4 ; Case_Ignorable # Mn KHITAN SMALL SCRIPT FILLER +16FF2..16FF3 ; Case_Ignorable # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1AFF0..1AFF3 ; Case_Ignorable # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; Case_Ignorable # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; Case_Ignorable # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 @@ -3506,7 +3507,7 @@ E0001 ; Case_Ignorable # Cf LANGUAGE TAG E0020..E007F ; Case_Ignorable # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Case_Ignorable # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 2749 +# Total code points: 2751 # ================================================ @@ -6853,7 +6854,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 16F93..16F9F ; ID_Start # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; ID_Start # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; ID_Start # Lm OLD CHINESE ITERATION MARK -16FF2..16FF3 ; ID_Start # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; ID_Start # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; ID_Start # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; ID_Start # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; ID_Start # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -8224,7 +8225,7 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN 16FE3 ; ID_Continue # Lm OLD CHINESE ITERATION MARK 16FE4 ; ID_Continue # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; ID_Continue # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FF3 ; ID_Continue # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; ID_Continue # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; ID_Continue # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; ID_Continue # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; ID_Continue # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -9041,7 +9042,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU 16F93..16F9F ; XID_Start # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; XID_Start # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; XID_Start # Lm OLD CHINESE ITERATION MARK -16FF2..16FF3 ; XID_Start # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; XID_Start # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; XID_Start # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; XID_Start # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; XID_Start # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -10413,7 +10414,7 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA 16FE3 ; XID_Continue # Lm OLD CHINESE ITERATION MARK 16FE4 ; XID_Continue # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; XID_Continue # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FF3 ; XID_Continue # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; XID_Continue # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; XID_Continue # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; XID_Continue # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; XID_Continue # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -12611,7 +12612,7 @@ FFFC..FFFD ; Grapheme_Base # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEME 16FE0..16FE1 ; Grapheme_Base # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE2 ; Grapheme_Base # Po OLD CHINESE HOOK MARK 16FE3 ; Grapheme_Base # Lm OLD CHINESE ITERATION MARK -16FF2..16FF3 ; Grapheme_Base # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; Grapheme_Base # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; Grapheme_Base # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; Grapheme_Base # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; Grapheme_Base # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 diff --git a/unicodetools/data/ucd/dev/EastAsianWidth.txt b/unicodetools/data/ucd/dev/EastAsianWidth.txt index 527aae5db..99447310b 100644 --- a/unicodetools/data/ucd/dev/EastAsianWidth.txt +++ b/unicodetools/data/ucd/dev/EastAsianWidth.txt @@ -1,5 +1,5 @@ # EastAsianWidth-16.0.0.txt -# Date: 2024-06-06, 16:48:53 GMT +# Date: 2024-07-24, 16:28:27 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -2360,7 +2360,7 @@ FFFD ; A # So REPLACEMENT CHARACTER 16FE3 ; W # Lm OLD CHINESE ITERATION MARK 16FE4 ; W # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; W # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FF3 ; N # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; N # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; W # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18AFF ; W # Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 18B00..18CD5 ; W # Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5 diff --git a/unicodetools/data/ucd/dev/LineBreak.txt b/unicodetools/data/ucd/dev/LineBreak.txt index 68f441a10..68db5b36c 100644 --- a/unicodetools/data/ucd/dev/LineBreak.txt +++ b/unicodetools/data/ucd/dev/LineBreak.txt @@ -1,5 +1,5 @@ # LineBreak-16.0.0.txt -# Date: 2024-06-06, 16:48:54 GMT +# Date: 2024-07-24, 16:28:29 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -3275,7 +3275,7 @@ FFFD ; AI # So REPLACEMENT CHARACTER 16FE3 ; NS # Lm OLD CHINESE ITERATION MARK 16FE4 ; GL # Mn KHITAN SMALL SCRIPT FILLER 16FF0..16FF1 ; CM # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FF3 ; NS # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; NS # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; ID # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18AFF ; ID # Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 18B00..18CD5 ; AL # Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5 diff --git a/unicodetools/data/ucd/dev/PropList.txt b/unicodetools/data/ucd/dev/PropList.txt index ed22845c1..fc8b2d23d 100644 --- a/unicodetools/data/ucd/dev/PropList.txt +++ b/unicodetools/data/ucd/dev/PropList.txt @@ -1,5 +1,5 @@ # PropList-16.0.0.txt -# Date: 2024-06-06, 16:49:04 GMT +# Date: 2024-07-24, 16:28:51 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -871,7 +871,7 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA F900..FA6D ; Ideographic # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 16FE4 ; Ideographic # Mn KHITAN SMALL SCRIPT FILLER -16FF2..16FF3 ; Ideographic # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; Ideographic # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; Ideographic # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; Ideographic # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; Ideographic # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -1194,7 +1194,7 @@ FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND 16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM 16FE0..16FE1 ; Extender # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Extender # Lm OLD CHINESE ITERATION MARK -16FF2..16FF3 ; Extender # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; Extender # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1E13C..1E13D ; Extender # Lm [2] NYIAKENG PUACHUE HMONG SIGN XW XW..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER 1E5EF ; Extender # Mn OL ONAL SIGN IKIR 1E944..1E946 ; Extender # Mn [3] ADLAM ALIF LENGTHENER..ADLAM GEMINATION MARK diff --git a/unicodetools/data/ucd/dev/Scripts.txt b/unicodetools/data/ucd/dev/Scripts.txt index ea0411062..20498e665 100644 --- a/unicodetools/data/ucd/dev/Scripts.txt +++ b/unicodetools/data/ucd/dev/Scripts.txt @@ -1,5 +1,5 @@ # Scripts-16.0.0.txt -# Date: 2024-06-06, 16:49:19 GMT +# Date: 2024-07-24, 16:29:26 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -1594,7 +1594,7 @@ FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILI 16FE2 ; Han # Po OLD CHINESE HOOK MARK 16FE3 ; Han # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; Han # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FF3 ; Han # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; Han # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 20000..2A6DF ; Han # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF 2A700..2B739 ; Han # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D diff --git a/unicodetools/data/ucd/dev/VerticalOrientation.txt b/unicodetools/data/ucd/dev/VerticalOrientation.txt index 68ba53480..017ef0329 100644 --- a/unicodetools/data/ucd/dev/VerticalOrientation.txt +++ b/unicodetools/data/ucd/dev/VerticalOrientation.txt @@ -1,5 +1,5 @@ # VerticalOrientation-16.0.0.txt -# Date: 2024-06-06, 16:49:21 GMT +# Date: 2024-07-24, 16:29:33 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -2188,7 +2188,7 @@ FFFC..FFFD ; U # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARA 16FE4 ; U # Mn KHITAN SMALL SCRIPT FILLER 16FE5..16FEF ; U # Cn [11] .. 16FF0..16FF1 ; U # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FF3 ; U # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; U # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 16FF4..16FFF ; U # Cn [12] .. 17000..187F7 ; U # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 187F8..187FF ; U # Cn [8] .. diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt index 2353266e6..462041bf4 100644 --- a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt +++ b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt @@ -1,5 +1,5 @@ # SentenceBreakProperty-16.0.0.txt -# Date: 2024-06-06, 16:49:19 GMT +# Date: 2024-07-24, 16:29:28 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -2510,7 +2510,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 16F93..16F9F ; OLetter # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; OLetter # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; OLetter # Lm OLD CHINESE ITERATION MARK -16FF2..16FF3 ; OLetter # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; OLetter # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; OLetter # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; OLetter # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; OLetter # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 diff --git a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt index 57703014b..164ac99bb 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt @@ -1,5 +1,5 @@ # DerivedBidiClass-16.0.0.txt -# Date: 2024-06-06, 16:48:46 GMT +# Date: 2024-07-24, 16:28:13 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -1103,7 +1103,7 @@ FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER 16FE0..16FE1 ; L # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; L # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; L # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY -16FF2..16FF3 ; L # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; L # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; L # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; L # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; L # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 diff --git a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt index 34e47f527..13b91b144 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt @@ -1,5 +1,5 @@ # DerivedCombiningClass-16.0.0.txt -# Date: 2024-06-06, 16:48:47 GMT +# Date: 2024-07-24, 16:28:16 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -1839,7 +1839,7 @@ FFFC..FFFD ; 0 # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER 16FE2 ; 0 # Po OLD CHINESE HOOK MARK 16FE3 ; 0 # Lm OLD CHINESE ITERATION MARK 16FE4 ; 0 # Mn KHITAN SMALL SCRIPT FILLER -16FF2..16FF3 ; 0 # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; 0 # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; 0 # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; 0 # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; 0 # Lo [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 diff --git a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt index 7b5371a94..67d2dd4a8 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt @@ -1,5 +1,5 @@ # DerivedEastAsianWidth-16.0.0.txt -# Date: 2024-06-06, 16:48:49 GMT +# Date: 2024-07-24, 16:28:20 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -1874,7 +1874,7 @@ FFFC ; N # So OBJECT REPLACEMENT CHARACTER 16F51..16F87 ; N # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F8F..16F92 ; N # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; N # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 -16FF2..16FF3 ; N # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; N # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1BC00..1BC6A ; N # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M 1BC70..1BC7C ; N # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK 1BC80..1BC88 ; N # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL diff --git a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt index 6ff46c4f2..551cb8412 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt @@ -1,5 +1,5 @@ # DerivedGeneralCategory-16.0.0.txt -# Date: 2024-06-06, 16:48:50 GMT +# Date: 2024-07-24, 16:28:21 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -2165,6 +2165,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK 16F93..16F9F ; Lm # [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 16FE0..16FE1 ; Lm # [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE3 ; Lm # OLD CHINESE ITERATION MARK +16FF2..16FF3 ; Lm # [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1AFF0..1AFF3 ; Lm # [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 1AFF5..1AFFB ; Lm # [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 1AFFD..1AFFE ; Lm # [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 @@ -2173,7 +2174,7 @@ FF9E..FF9F ; Lm # [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAK 1E4EB ; Lm # NAG MUNDARI SIGN OJOD 1E94B ; Lm # ADLAM NASALIZATION MARK -# Total code points: 404 +# Total code points: 406 # ================================================ @@ -2639,7 +2640,6 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 16D43..16D6A ; Lo # [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU 16F00..16F4A ; Lo # [75] MIAO LETTER PA..MIAO LETTER RTE 16F50 ; Lo # MIAO LETTER NASALIZATION -16FF2..16FF3 ; Lo # [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 17000..187F7 ; Lo # [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 18800..18CD5 ; Lo # [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 18CFF..18D08 ; Lo # [10] KHITAN SMALL SCRIPT CHARACTER-18CFF..TANGUT IDEOGRAPH-18D08 @@ -2709,7 +2709,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 30000..3134A ; Lo # [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Lo # [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 136479 +# Total code points: 136477 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt index 31a721290..761b100e2 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt @@ -1,5 +1,5 @@ # DerivedLineBreak-16.0.0.txt -# Date: 2024-06-06, 16:48:51 GMT +# Date: 2024-07-24, 16:28:23 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -356,7 +356,7 @@ FF9E..FF9F ; NS # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KA 16FE0..16FE1 ; NS # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK 16FE2 ; NS # Po OLD CHINESE HOOK MARK 16FE3 ; NS # Lm OLD CHINESE ITERATION MARK -16FF2..16FF3 ; NS # Lo [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF2..16FF3 ; NS # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER 1F679..1F67B ; NS # So [3] HEAVY INTERROBANG ORNAMENT..HEAVY SANS-SERIF INTERROBANG ORNAMENT # Total code points: 37 From 919c5a8ff66a13334eaf39bfc76298a459e236a8 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Nov 2024 21:45:47 +0100 Subject: [PATCH 10/10] Not lowercase --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index f9ba87ddd..608645929 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -721,7 +721,7 @@ Let $nonLowercaseSmallLetters := [ \N{TURNED GREEK SMALL LETTER IOTA} \p{name=/^(SQUARED|PARENTHESIZED|TAG) LATIN SMALL LETTER/} ] -Let $nonLowercaseSmallModifierLetters := [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] +Let $nonLowercaseSmallModifierLetters := [ \p{gc=Lm} & \p{name=/^(ARABIC|CHINESE) SMALL/} ] [ \p{name=/\bSMALL LETTER\b/}-\p{gc=Mn}-\p{gc=Lt} - $nonLowercaseSmallLetters ] ⊆ \p{Lowercase} [ [\p{gc=Lm} & \p{name=/SMALL/}] - $nonLowercaseSmallModifierLetters ] ⊆ \p{Lowercase}