diff --git a/docs/help/changes.md b/docs/help/changes.md index 32a91caa6..c4cf6cafe 100644 --- a/docs/help/changes.md +++ b/docs/help/changes.md @@ -3,42 +3,38 @@ The Unicode Utilities have been modified to support both properties from the released version of Unicode (via ICU) and from the new Unicode beta. -To get the beta version of the property, insert β *after* the property name. +To get the beta version of the property, insert `Uβ:` *before* the property name. +The explicit version number for the β can be used; +the resulting property is then only valid when that specific β is current. Examples: -| `\p{Word_Break=ALetter}` | Released version of Unicode | -| `\p{Word_Breakβ=ALetter}` | Beta version of Unicode | +| Query | Result | +|---|---| +| `\p{Word_Break=ALetter}` | Released version of Unicode. | +| `\p{Uβ:Word_Break=ALetter}` | Beta version of Unicode; error outside of beta review. | +| `\p{U16β:Word_Break=ALetter}` | Beta version of Unicode 16.0; error during the beta review of any other version. | For example, to see additions to that property value in the beta version, use:
-[`\p{Word_Breakβ=ALetter}-\\p{Word_Break=ALetter}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BWord_Break%CE%B2%3DALetter%7D-%5Cp%7BWord_Break%3DALetter%7D&g=&i=) +[`\p{Uβ:Word_Break=ALetter}-\p{Word_Break=ALetter}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BU%CE%B2%3AWord_Break%3DALetter%7D-%5Cp%7BWord_Break%3DALetter%7D&g=&i=)
## Caveats -The support is not complete done, and there are some known problems. - -1. Some properties are not supported in beta versions. See - - for the list. -2. When characters are listed, the new blocks and subheads don't show up. -3. If you use a property that has a β version but no ICU version, you get no - error: just an empty listing. -4. The beta properties don't yet have the "shorthands" for cases like \\p{Lu}. - So make sure the property is listed, eg \\p{gcβ=Lu} - 1. Example: - [`\p{gcβ=Lu}-\\p{gc=Lu}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7Bgc%CE%B2%3DLu%7D-%5Cp%7Bgc%3DLu%7D&g=&i=) -5. Tools for segmentation, etc. use the release properties; there isn't a way +The support is not completely done, and there are some known problems. + +1. The General_Category groupings such as \\p{Uβ:L} are not correctly implemented. + Only actual values, such as \\p{Uβ:Lu} etc., work. +2. Tools for segmentation, etc. use the release properties; there isn't a way to have them use the beta properties. -6. There are probably others... +3. There are probably others... If you find a problem, please file a ticket at -: make sure to start the summary with -"Unicode Utilities: " +https://github.com/unicode-org/unicodetools/issues. [Back to Unicode Utilities Help Home](index) \ No newline at end of file diff --git a/unicodetools/data/ucd/dev/DerivedAge.txt b/unicodetools/data/ucd/dev/DerivedAge.txt index f6e645639..1dd05365d 100644 --- a/unicodetools/data/ucd/dev/DerivedAge.txt +++ b/unicodetools/data/ucd/dev/DerivedAge.txt @@ -1,5 +1,5 @@ # DerivedAge-16.0.0.txt -# Date: 2024-06-06, 10:07:23 GMT +# Date: 2024-06-07, 16:34:38 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt index c6bac003e..f837cb8fa 100644 --- a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt +++ b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt @@ -1,5 +1,5 @@ # DerivedCoreProperties-16.0.0.txt -# Date: 2024-06-06, 10:07:42 GMT +# Date: 2024-06-07, 16:34:58 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/PropertyAliases.txt b/unicodetools/data/ucd/dev/PropertyAliases.txt index addbb0253..69dbb8d2e 100644 --- a/unicodetools/data/ucd/dev/PropertyAliases.txt +++ b/unicodetools/data/ucd/dev/PropertyAliases.txt @@ -1,5 +1,5 @@ # PropertyAliases-16.0.0.txt -# Date: 2024-04-30, 21:48:30 GMT +# Date: 2024-06-06, 21:52:48 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -99,6 +99,11 @@ cjkIRG_VSource ; kIRG_VSource cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS isc ; ISO_Comment JSN ; Jamo_Short_Name +kEH_Cat ; kEH_Cat +kEH_Desc ; kEH_Desc +kEH_HG ; kEH_HG +kEH_IFAO ; kEH_IFAO +kEH_JSesh ; kEH_JSesh na ; Name na1 ; Unicode_1_Name Name_Alias ; Name_Alias @@ -179,6 +184,8 @@ IDSB ; IDS_Binary_Operator IDST ; IDS_Trinary_Operator IDSU ; IDS_Unary_Operator Join_C ; Join_Control +kEH_NoMirror ; kEH_NoMirror +kEH_NoRotate ; kEH_NoRotate LOE ; Logical_Order_Exception Lower ; Lowercase Math ; Math @@ -213,6 +220,6 @@ XO_NFKC ; Expands_On_NFKC XO_NFKD ; Expands_On_NFKD # ================================================ -# Total: 135 +# Total: 142 # EOF diff --git a/unicodetools/data/ucd/dev/PropertyValueAliases.txt b/unicodetools/data/ucd/dev/PropertyValueAliases.txt index 8b62a2a42..8ca25c640 100644 --- a/unicodetools/data/ucd/dev/PropertyValueAliases.txt +++ b/unicodetools/data/ucd/dev/PropertyValueAliases.txt @@ -1,5 +1,5 @@ # PropertyValueAliases-16.0.0.txt -# Date: 2024-06-06, 10:08:00 GMT +# Date: 2024-06-07, 16:35:15 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -1676,4 +1676,34 @@ XIDS; Y ; Yes ; T # @missing: 0000..10FFFF; cjkRSUnicode; +# kEH_Cat (kEH_Cat) + +# @missing: 0000..10FFFF; kEH_Cat; + +# kEH_Desc (kEH_Desc) + +# @missing: 0000..10FFFF; kEH_Desc; + +# kEH_HG (kEH_HG) + +# @missing: 0000..10FFFF; kEH_HG; + +# kEH_IFAO (kEH_IFAO) + +# @missing: 0000..10FFFF; kEH_IFAO; + +# kEH_JSesh (kEH_JSesh) + +# @missing: 0000..10FFFF; kEH_JSesh; + +# kEH_NoMirror (kEH_NoMirror) + +kEH_NoMirror; N ; No ; F ; False +kEH_NoMirror; Y ; Yes ; T ; True + +# kEH_NoRotate (kEH_NoRotate) + +kEH_NoRotate; N ; No ; F ; False +kEH_NoRotate; Y ; Yes ; T ; True + # EOF diff --git a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt index c68f5c8fb..ca5eff1c3 100644 --- a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt +++ b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt @@ -1,5 +1,5 @@ # GraphemeBreakProperty-16.0.0.txt -# Date: 2024-06-06, 10:07:48 GMT +# Date: 2024-06-07, 16:35:03 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt index ffbccd2e5..57a9a58a6 100644 --- a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt +++ b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt @@ -1,5 +1,5 @@ # SentenceBreakProperty-16.0.0.txt -# Date: 2024-06-06, 10:08:13 GMT +# Date: 2024-06-07, 16:35:29 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt index 6444a8e65..a961840b1 100644 --- a/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt +++ b/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt @@ -1,5 +1,5 @@ # WordBreakProperty-16.0.0.txt -# Date: 2024-06-06, 10:08:15 GMT +# Date: 2024-06-07, 16:35:31 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt index 34ef68d83..82b7b5e93 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt @@ -1,5 +1,5 @@ # DerivedBidiClass-16.0.0.txt -# Date: 2024-06-06, 10:07:40 GMT +# Date: 2024-06-07, 16:34:55 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt index bf6958b9d..051ea0e7f 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt @@ -1,5 +1,5 @@ # DerivedCombiningClass-16.0.0.txt -# Date: 2024-06-06, 10:07:41 GMT +# Date: 2024-06-07, 16:34:57 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt index 1f4593cc5..052ecaf0c 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt @@ -1,5 +1,5 @@ # DerivedEastAsianWidth-16.0.0.txt -# Date: 2024-06-06, 10:07:44 GMT +# Date: 2024-06-07, 16:34:59 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt index 0485f9373..cf87aa6d1 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt @@ -1,5 +1,5 @@ # DerivedGeneralCategory-16.0.0.txt -# Date: 2024-06-06, 10:07:44 GMT +# Date: 2024-06-07, 16:34:59 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt b/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt index 0c3a8afaf..cc5888b2e 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt @@ -1,5 +1,5 @@ # DerivedJoiningType-16.0.0.txt -# Date: 2024-06-06, 10:07:45 GMT +# Date: 2024-06-07, 16:35:00 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt index 0e22de905..95408aa24 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt @@ -1,5 +1,5 @@ # DerivedLineBreak-16.0.0.txt -# Date: 2024-06-06, 10:07:45 GMT +# Date: 2024-06-07, 16:35:01 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/extracted/DerivedName.txt b/unicodetools/data/ucd/dev/extracted/DerivedName.txt index ee3be48a3..b7e94719c 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedName.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedName.txt @@ -1,5 +1,5 @@ # DerivedName-16.0.0.txt -# Date: 2024-06-06, 10:07:45 GMT +# Date: 2024-06-07, 16:35:01 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java index 68ff0d963..6c794380e 100644 --- a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java +++ b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java @@ -516,7 +516,10 @@ static void parseSourceFile( } else { indexUnicodeProperties.getFileNames().add(fullFilename); UcdLineParser parser = new UcdLineParser(FileUtilities.in("", fullFilename)); - if (fileName.startsWith("Unihan") || fileName.startsWith("k")) { + if (fileName.startsWith("Unihan") + || fileName.startsWith("Unikemet") + || (fileName.endsWith("Sources") && !fileName.startsWith("Emoji")) + || fileName.startsWith("k")) { parser.withTabs(true); } PropertyParsingInfo propInfo; diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyStatus.java b/unicodetools/src/main/java/org/unicode/props/PropertyStatus.java index da2e3c175..3bd97f746 100644 --- a/unicodetools/src/main/java/org/unicode/props/PropertyStatus.java +++ b/unicodetools/src/main/java/org/unicode/props/PropertyStatus.java @@ -131,6 +131,7 @@ public enum PropertyScope { UcdProperty.Emoji_KDDI, UcdProperty.Emoji_SB); + // TODO(egg): These lists are not up to date! private static final EnumSet CONTRIBUTORY_PROPERTY = EnumSet.of( UcdProperty.Jamo_Short_Name, @@ -230,7 +231,10 @@ public enum PropertyScope { UcdProperty.Named_Sequences_Prov, UcdProperty.Regional_Indicator, UcdProperty.Standardized_Variant, - UcdProperty.Vertical_Orientation); + UcdProperty.Vertical_Orientation, + // Unikemet + UcdProperty.kEH_Cat, + UcdProperty.kEH_Desc); private static final EnumSet NORMATIVE_PROPERTY = EnumSet.of( @@ -290,7 +294,13 @@ public enum PropertyScope { UcdProperty.kIRG_MSource, UcdProperty.kIRG_TSource, UcdProperty.kIRG_USource, - UcdProperty.kIRG_VSource); + UcdProperty.kIRG_VSource, + // Unikemet + UcdProperty.kEH_HG, + UcdProperty.kEH_IFAO, + UcdProperty.kEH_JSesh, + UcdProperty.kEH_NoMirror, + UcdProperty.kEH_NoRotate); private static final EnumSet IMMUTABLE_PROPERTY = EnumSet.of( UcdProperty.Name, diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java index 0e18f8867..fd9e5b7a3 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java @@ -105,6 +105,14 @@ public enum UcdProperty { kDaeJaweon(PropertyType.Miscellaneous, "cjkDaeJaweon"), kDefinition(PropertyType.Miscellaneous, "cjkDefinition"), kEACC(PropertyType.Miscellaneous, "cjkEACC"), + kEH_Cat(PropertyType.Miscellaneous, "kEH_Cat"), + kEH_Desc(PropertyType.Miscellaneous, "kEH_Desc"), + kEH_FVal(PropertyType.Miscellaneous, "kEH_FVal"), + kEH_Func(PropertyType.Miscellaneous, "kEH_Func"), + kEH_HG(PropertyType.Miscellaneous, "kEH_HG"), + kEH_IFAO(PropertyType.Miscellaneous, "kEH_IFAO"), + kEH_JSesh(PropertyType.Miscellaneous, "kEH_JSesh"), + kEH_UniK(PropertyType.Miscellaneous, "kEH_UniK"), kFanqie(PropertyType.Miscellaneous, "cjkFanqie"), kFenn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFenn"), kFennIndex(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFennIndex"), @@ -182,6 +190,7 @@ public enum UcdProperty { kRSKanWa(PropertyType.Miscellaneous, "cjkRSKanWa"), kRSKangXi(PropertyType.Miscellaneous, "cjkRSKangXi"), kRSKorean(PropertyType.Miscellaneous, "cjkRSKorean"), + kRSTUnicode(PropertyType.Miscellaneous, "kRSTUnicode"), kRSUnicode( PropertyType.Miscellaneous, null, @@ -189,6 +198,7 @@ public enum UcdProperty { "cjkRSUnicode", "Unicode_Radical_Stroke", "URS"), + kReading(PropertyType.Miscellaneous, "kReading"), kSBGY(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSBGY"), kSMSZD2003Index(PropertyType.Miscellaneous, "cjkSMSZD2003Index"), kSMSZD2003Readings(PropertyType.Miscellaneous, "cjkSMSZD2003Readings"), @@ -200,9 +210,11 @@ public enum UcdProperty { ValueCardinality.Unordered, "cjkSpecializedSemanticVariant"), kSpoofingVariant(PropertyType.Miscellaneous, "cjkSpoofingVariant"), + kSrc_NushuDuben(PropertyType.Miscellaneous, "kSrc_NushuDuben"), kStrange(PropertyType.Miscellaneous, "cjkStrange"), kTGH(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGH"), kTGHZ2013(PropertyType.Miscellaneous, "cjkTGHZ2013"), + kTGT_MergedSrc(PropertyType.Miscellaneous, "kTGT_MergedSrc"), kTaiwanTelegraph(PropertyType.Miscellaneous, "cjkTaiwanTelegraph"), kTang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTang"), kTotalStrokes(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkTotalStrokes"), @@ -341,6 +353,9 @@ public enum UcdProperty { White_Space(PropertyType.Binary, Binary.class, null, "WSpace", "space"), XID_Continue(PropertyType.Binary, Binary.class, null, "XIDC"), XID_Start(PropertyType.Binary, Binary.class, null, "XIDS"), + kEH_Core(PropertyType.Binary, Binary.class, null, "kEH_Core"), + kEH_NoMirror(PropertyType.Binary, Binary.class, null, "kEH_NoMirror"), + kEH_NoRotate(PropertyType.Binary, Binary.class, null, "kEH_NoRotate"), // Unknown ; diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index 347e442f6..94dc3f14a 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -1434,6 +1434,14 @@ public static Joining_Type_Values forName(String name) { // kDaeJaweon // kDefinition // kEACC + // kEH_Cat + // kEH_Desc + // kEH_Func + // kEH_FVal + // kEH_HG + // kEH_IFAO + // kEH_JSesh + // kEH_UniK // kFanqie // kFenn // kFennIndex @@ -1501,11 +1509,13 @@ public static Joining_Type_Values forName(String name) { // kPhonetic // kPrimaryNumeric // kPseudoGB1 + // kReading // kRSAdobe_Japan1_6 // kRSJapanese // kRSKangXi // kRSKanWa // kRSKorean + // kRSTUnicode // kRSUnicode // kSBGY // kSemanticVariant @@ -1514,11 +1524,13 @@ public static Joining_Type_Values forName(String name) { // kSMSZD2003Readings // kSpecializedSemanticVariant // kSpoofingVariant + // kSrc_NushuDuben // kStrange // kTaiwanTelegraph // kTang // kTGH // kTGHZ2013 + // kTGT_MergedSrc // kTotalStrokes // kTraditionalVariant // kUnihanCore2020 diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java b/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java index 582d42eec..ef3e215a6 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java @@ -309,6 +309,13 @@ public String _getValue(int codepoint) { "cjkIRG_VSource", "cjkIRG_VSource", "kIRG_VSource"); + add(iup.getProperty("kEH_Cat")); + add(iup.getProperty("kEH_Desc")); + add(iup.getProperty("kEH_HG")); + add(iup.getProperty("kEH_IFAO")); + add(iup.getProperty("kEH_JSesh")); + add(iup.getProperty("kEH_NoMirror")); + add(iup.getProperty("kEH_NoRotate")); add(iup.getProperty("Emoji")); add(iup.getProperty("Emoji_Presentation")); add(iup.getProperty("Emoji_Modifier")); diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/VersionedProperty.java b/unicodetools/src/main/java/org/unicode/text/UCD/VersionedProperty.java index 25164d073..1aedb2410 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/VersionedProperty.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/VersionedProperty.java @@ -8,6 +8,7 @@ import java.util.Set; import java.util.TreeMap; import java.util.function.Supplier; +import java.util.stream.Collectors; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UnicodeProperty; import org.unicode.props.UnicodeProperty.Factory; @@ -51,6 +52,7 @@ public static VersionedProperty forJSPs(Supplier oldestLoadedUcd) { result.throwOnUnknownProperty = false; result.defaultVersion = Settings.lastVersion; result.versionAliases.put("dev", Settings.latestVersion); + result.versionAliases.put(Settings.latestVersionPhase.toString(), Settings.latestVersion); result.oldestLoadedUcd = oldestLoadedUcd; for (String latest = Settings.latestVersion; ; @@ -96,8 +98,15 @@ public VersionedProperty set(String xPropertyName) { version = aliased; } else { version = names[0].substring(1); - if (versionAliases.containsValue(version)) { - throw new IllegalArgumentException("Invalid version " + version); + if (versionAliases.containsValue( + VersionInfo.getInstance(version).getVersionString(3, 3))) { + throw new IllegalArgumentException( + "Unreleased version " + + version + + "; use suffix: " + + versionAliases.keySet().stream() + .map(v -> "U" + v) + .collect(Collectors.joining(", "))); } } xPropertyName = names[1]; diff --git a/unicodetools/src/main/java/org/unicode/text/utility/Settings.java b/unicodetools/src/main/java/org/unicode/text/utility/Settings.java index 8729fbe06..d32ace265 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/Settings.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/Settings.java @@ -41,7 +41,7 @@ public String toString() { } }; - public static final ReleasePhase latestVersionPhase = ReleasePhase.ALPHA; + public static final ReleasePhase latestVersionPhase = ReleasePhase.BETA; public static final String lastVersion = "15.1.0"; // last released version diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt index db657d34b..80faee3c7 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt @@ -17,6 +17,10 @@ RETS ; RGI_Emoji_Tag_Sequence ; Emoji_Tag_Sequence REZS ; RGI_Emoji_Zwj_Sequence ; Emoji_Zwj_Sequence # RE ; RGI_Emoji +kEH_Core ; kEH_Core +kEH_NoMirror ; kEH_NoMirror +kEH_NoRotate ; kEH_NoRotate + # ================================================ # Enumerated Properties # ================================================ @@ -162,3 +166,13 @@ cjkVietnameseNumeric ; kVietnameseNumeric cjkZhuangNumeric ; kZhuangNumeric # 16.0 cjkFanqie ; kFanqie + +kTGT_MergedSrc ; kTGT_MergedSrc +kRSTUnicode ; kRSTUnicode + +kSrc_NushuDuben ; kSrc_NushuDuben +kReading ; kReading + +kEH_Func ; kEH_Func +kEH_FVal ; kEH_FVal +kEH_UniK ; kEH_UniK \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt index 98613a31c..8d659c98f 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt @@ -85,6 +85,10 @@ # @missing: 0000..10FFFF; Emoji_Component ; No # @missing: 0000..10FFFF; Extended_Pictographic ; No +# @missing: 0000..10FFFF; kEH_Core ; No +# @missing: 0000..10FFFF; kEH_NoMirror ; No +# @missing: 0000..10FFFF; kEH_NoRotate ; No + # End of binary properties. # @missing: 0000..10FFFF; Canonical_Combining_Class; Not_Reordered @@ -194,3 +198,13 @@ Do_Not_Emit_Type ; Precomposed_Form ; Precomposed_Form Do_Not_Emit_Type ; Deprecated ; Deprecated Do_Not_Emit_Type ; Discouraged ; Discouraged Do_Not_Emit_Type ; Preferred_Spelling ; Preferred_Spelling + +# @missing: 0000..10FFFF; kTGT_MergedSrc ; +# @missing: 0000..10FFFF; kRSTUnicode ; + +# @missing: 0000..10FFFF; kSrc_NushuDuben ; +# @missing: 0000..10FFFF; kReading ; + +# @missing: 0000..10FFFF; kEH_Func ; +# @missing: 0000..10FFFF; kEH_FVal ; +# @missing: 0000..10FFFF; kEH_UniK ; diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt index 71f5ffe73..018f9614d 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt @@ -368,3 +368,25 @@ emoji/*/emoji-sequences; RGI_Emoji_Tag_Sequence emoji/*/emoji-zwj-sequences; RGI_Emoji_Zwj_Sequence #emoji/*/emoji-test ; Emoji_Short_Name + + +FileType ; TangutSources ; PropertyValue +TangutSources ; kTGT_MergedSrc +TangutSources ; kRSTUnicode + +FileType ; NushuSources ; PropertyValue +NushuSources ; kSrc_NushuDuben +NushuSources ; kReading + +FileType ; Unikemet ; PropertyValue +Unikemet ; kEH_Cat +Unikemet ; kEH_Core +Unikemet ; kEH_Desc +Unikemet ; kEH_Func +Unikemet ; kEH_FVal +Unikemet ; kEH_JSesh +Unikemet ; kEH_HG +Unikemet ; kEH_IFAO +Unikemet ; kEH_NoMirror +Unikemet ; kEH_NoRotate +Unikemet ; kEH_UniK \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 3b3f3c35a..65d0004eb 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -1090,6 +1090,29 @@ Let $japaneseSimplifiedRadicals = \p{Name=/CJK RADICAL J-SIMPLIFIED/} In $chineseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ [\p{kRSUnicode=/^[0-9]+'\.0$/} $radicalsWithUnifiableSimplifications] In $japaneseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/^[0-9]+''\.0$/} +# Tangut invariants + +Let $tangutSourcesScope = [\p{Block=/^Tangut(.Supplement)?$/} - \p{gc=Cn}] +$tangutSourcesScope = [ [\p{gc=Lo} & \p{sc=Tangut}] - \p{name=/^TANGUT COMPONENT-/} ] +$tangutSourcesScope = \P{kTGT_MergedSrc=@none@} +$tangutSourcesScope = \P{kRSTUnicode=@none@} + +# Nüshu invariants + +Let $nüshuSourcesScope = [\p{Block=Nushu} - \p{gc=Cn}] +$nüshuSourcesScope = [\p{gc=Lo} & \p{sc=Nushu}] +$nüshuSourcesScope = \P{kSrc_NushuDuben=@none@} +$nüshuSourcesScope = \P{kReading=@none@} + +# Egyptian hieroglyph invariants + +Let $unikemetScope = [\p{Block=/^Egyptian.Hieroglyphs/} - \p{gc=Cn}] +$unikemetScope = [ [\p{gc=Lo} & \p{sc=Egyp}] - \p{Name=/^EGYPTIAN HIEROGLYPH (FULL |HALF |TALL |WIDE )?(BLANK|LOST SIGN)$/} ] +$unikemetScope = \P{kEH_Cat=@none@} +$unikemetScope = \P{kEH_UniK=@none@} +\p{kEH_NoMirror} ⊂ $unikemetScope +\p{kEH_NoRotate} ⊂ $unikemetScope + # InPC-InSC-gc invariants # See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf. \p{InPC=/(Left|Right)/} ⊆ [\p{gc=Mc}\p{gc=Lo}\p{gc=Lm}]