Skip to content

Commit

Permalink
Initial checkin for UcdXML
Browse files Browse the repository at this point in the history
  • Loading branch information
jowilco committed Jun 6, 2024
1 parent 184d7e5 commit 2f29705
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,8 @@ static void parseSourceFile(
} else {
indexUnicodeProperties.getFileNames().add(fullFilename);
UcdLineParser parser = new UcdLineParser(FileUtilities.in("", fullFilename));
if (fileName.startsWith("Unihan") || fileName.startsWith("k")) {
if (fileName.startsWith("Unihan") || fileName.startsWith("k")
|| fileName.startsWith("NushuSources") || fileName.startsWith("TangutSources")) {
parser.withTabs(true);
}
PropertyParsingInfo propInfo;
Expand Down
9 changes: 9 additions & 0 deletions unicodetools/src/main/java/org/unicode/props/UcdProperty.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,16 @@ public enum UcdProperty {
Emoji_SB(PropertyType.Miscellaneous, "ESB"),
ISO_Comment(PropertyType.Miscellaneous, "isc"),
Jamo_Short_Name(PropertyType.Miscellaneous, "JSN"),
NC_Corrected(PropertyType.Miscellaneous, "ncCorrected"),
NC_Original(PropertyType.Miscellaneous, "ncOriginal"),
NC_Version(PropertyType.Miscellaneous, "ncVersion"),
Name(PropertyType.Miscellaneous, "na"),
Name_Alias(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "Name_Alias"),
Named_Sequences(PropertyType.Miscellaneous, "NS"),
Named_Sequences_Prov(PropertyType.Miscellaneous, "NSP"),
Standardized_Variant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "SV"),
Unicode_1_Name(PropertyType.Miscellaneous, "na1"),
emoji_variation_sequence(PropertyType.Miscellaneous, "EVS"),
kAlternateTotalStrokes(PropertyType.Miscellaneous, "cjkAlternateTotalStrokes"),
kBigFive(PropertyType.Miscellaneous, "cjkBigFive"),
kCCCII(PropertyType.Miscellaneous, "cjkCCCII"),
Expand Down Expand Up @@ -182,13 +186,15 @@ public enum UcdProperty {
kRSKanWa(PropertyType.Miscellaneous, "cjkRSKanWa"),
kRSKangXi(PropertyType.Miscellaneous, "cjkRSKangXi"),
kRSKorean(PropertyType.Miscellaneous, "cjkRSKorean"),
kRSTUnicode(PropertyType.Miscellaneous, "cjkRSTUnicode"),
kRSUnicode(
PropertyType.Miscellaneous,
null,
ValueCardinality.Ordered,
"cjkRSUnicode",
"Unicode_Radical_Stroke",
"URS"),
kReading(PropertyType.Miscellaneous, "cjkReading"),
kSBGY(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSBGY"),
kSMSZD2003Index(PropertyType.Miscellaneous, "cjkSMSZD2003Index"),
kSMSZD2003Readings(PropertyType.Miscellaneous, "cjkSMSZD2003Readings"),
Expand All @@ -200,9 +206,11 @@ public enum UcdProperty {
ValueCardinality.Unordered,
"cjkSpecializedSemanticVariant"),
kSpoofingVariant(PropertyType.Miscellaneous, "cjkSpoofingVariant"),
kSrc_NushuDuben(PropertyType.Miscellaneous, "cjkSrc_NushuDuben"),
kStrange(PropertyType.Miscellaneous, "cjkStrange"),
kTGH(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGH"),
kTGHZ2013(PropertyType.Miscellaneous, "cjkTGHZ2013"),
kTGT_MergedSrc(PropertyType.Miscellaneous, "cjkTGT_MergedSrc"),
kTaiwanTelegraph(PropertyType.Miscellaneous, "cjkTaiwanTelegraph"),
kTang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTang"),
kTotalStrokes(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkTotalStrokes"),
Expand All @@ -212,6 +220,7 @@ public enum UcdProperty {
kXHC1983(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkXHC1983"),
kXerox(PropertyType.Miscellaneous, "cjkXerox"),
kZVariant(PropertyType.Miscellaneous, "cjkZVariant"),
kZhuang(PropertyType.Miscellaneous, "cjkZhuang"),
kZhuangNumeric(PropertyType.Miscellaneous, "cjkZhuangNumeric"),

// Catalog
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ CJKR ; CJK_Radical
EDCM ; Emoji_DCM
EKDDI ; Emoji_KDDI
ESB ; Emoji_SB
EVS ; emoji_variation_sequence
NS ; Named_Sequences
NSP ; Named_Sequences_Prov
SV ; Standardized_Variant
Expand Down Expand Up @@ -145,6 +146,13 @@ cjkJoyoKanji ; kJoyoKanji
cjkKoreanEducationHanja ; kKoreanEducationHanja
cjkKoreanName ; kKoreanName
cjkTGH ; kTGH
cjkRSTUnicode ; kRSTUnicode
cjkReading ; kReading
cjkSrc_NushuDuben ; kSrc_NushuDuben
cjkTGT_MergedSrc ; kTGT_MergedSrc
ncCorrected ; NC_Corrected
ncOriginal ; NC_Original
ncVersion ; NC_Version
# 13.0
cjkSpoofingVariant ; kSpoofingVariant
cjkTGHZ2013 ; kTGHZ2013
Expand All @@ -162,3 +170,4 @@ cjkVietnameseNumeric ; kVietnameseNumeric
cjkZhuangNumeric ; kZhuangNumeric
# 16.0
cjkFanqie ; kFanqie
cjkZhuang ; kZhuang
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ kHanYu ; MULTI_VALUED ; [1-8][0-9]{4}\.[0-3
kIRGHanyuDaZidian ; SINGLE_VALUED ; [1-8][0-9]{4}\.[0-3][0-9][01]
kCNS1992 ; SINGLE_VALUED ; [1-9]-[0-9A-F]{4}
kTotalStrokes ; ORDERED ; [1-9][0-9]{0,2}
kRSUnicode ; ORDERED ; [1-9][0-9]{0,2}\'?\.[0-9]{1,2}
kRSUnicode ; ORDERED ; [1-9][0-9]{0,2}\'?\.[0-9]{1,2}
kRSJapanese ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2}
kRSKanWa ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2}
kRSKangXi ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2}
Expand Down Expand Up @@ -170,16 +170,25 @@ kHanyuPinlu ; MULTI_VALUED ; [a-z\x{308}]+[1-5]\
kCantonese ; MULTI_VALUED ; [a-z]{1,6}[1-6]
kTang ; MULTI_VALUED ; \*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+

kJinmeiyoKanji ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})?
kJoyoKanji ; MULTI_VALUED ; (20[0-9]{2})|(U\+2?[0-9A-F]{4})
kJinmeiyoKanji ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})?
kJoyoKanji ; MULTI_VALUED ; (20[0-9]{2})|(U\+2?[0-9A-F]{4})
kKoreanEducationHanja ; MULTI_VALUED ; 20[0-9]{2}
kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})*
kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3}
kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})*
kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3}


kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}
kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}
kIRG_SSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}

kSrc_NushuDuben ; SINGLE_VALUED ; [0-9]+\.[0-9]+
kReading ; SINGLE_VALUED ; [a-z]{1,6}[1-6]+
kRSTUnicode ; SINGLE_VALUED ; [0-9]+\.[0-9]+
kTGT_MergedSrc ; SINGLE_VALUED ; L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?

NC_Original ; SINGLE_VALUED ; [0-9A-F]{4,5}
NC_Corrected ; SINGLE_VALUED ; [0-9A-F]{4,5}
NC_Version ; SINGLE_VALUED ; [0-9]\.[0-9]\.[0-9]


# =============================
# Catalog/Enum/Binary Properties
Expand All @@ -204,5 +213,5 @@ Confusable_MA ; SINGLE_VALUED ; $codePoints
#Emoji ; SINGLE_VALUED ; <enum>
#Emoji_Presentation ; SINGLE_VALUED ; <enum>
#Emoji_Modifier ; SINGLE_VALUED ; <enum>
#Emoji_Modifier_Base ; SINGLE_VALUED ; <enum>
#Emoji_Modifier_Base ; SINGLE_VALUED ; <enum>

Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,16 @@ FileType ; Unihan_OtherMappings ; PropertyValue
FileType ; Unihan_RadicalStrokeCounts ; PropertyValue
FileType ; Unihan_Readings ; PropertyValue
FileType ; Unihan_Variants ; PropertyValue
FileType ; NushuSources ; PropertyValue
FileType ; TangutSources ; PropertyValue

# NameAliases File Type
# Contains a multivalued property whose values are not listed together on one line; instead, each value appears on its own line, and successive lines repeat the same code point.

FileType ; NameAliases ; NameAliases
FileType ; NameAliasesProv ; NameAliases
FileType ; StandardizedVariants ; StandardizedVariants
FileType ; emoji-variation-sequences ; StandardizedVariants

# CJKRadicals File Type

Expand Down Expand Up @@ -309,6 +312,15 @@ Unihan_Variants ; kSpoofingVariant
Unihan_Variants ; kTraditionalVariant
Unihan_Variants ; kZVariant

NushuSources ; kSrc_NushuDuben
NushuSources ; kReading
TangutSources ; kRSTUnicode
TangutSources ; kTGT_MergedSrc

NormalizationCorrections ; NC_Original
NormalizationCorrections ; NC_Corrected
NormalizationCorrections ; NC_Version

# Extras

ScriptExtensions ; Script_Extensions
Expand All @@ -319,6 +331,7 @@ EmojiSources ; Emoji_SB ; 3
NamedSequences ; Named_Sequences
NamedSequencesProv ; Named_Sequences_Prov
StandardizedVariants ; Standardized_Variant
emoji-variation-sequences ; emoji_variation_sequence
DoNotEmit ; Do_Not_Emit_Preferred ; 1
DoNotEmit ; Do_Not_Emit_Type ; 2

Expand Down

0 comments on commit 2f29705

Please sign in to comment.