Skip to content

Commit

Permalink
Add NFKC_SCF (#423)
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin authored May 3, 2023
1 parent b7ccbe6 commit 46c5231
Show file tree
Hide file tree
Showing 16 changed files with 6,236 additions and 105 deletions.
4 changes: 2 additions & 2 deletions .project
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
</natures>
<filteredResources>
<filter>
<id>1625608051965</id>
<id>1683027586342</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.core.resources.regexFilterMatcher</id>
<arguments>node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
<arguments>node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
</matcher>
</filter>
</filteredResources>
Expand Down
4 changes: 2 additions & 2 deletions unicodetools/.project
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@
</natures>
<filteredResources>
<filter>
<id>1625517710081</id>
<id>1683027586324</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.core.resources.regexFilterMatcher</id>
<arguments>node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
<arguments>node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
</matcher>
</filter>
</filteredResources>
Expand Down
6,075 changes: 6,073 additions & 2 deletions unicodetools/data/ucd/dev/DerivedNormalizationProps.txt

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions unicodetools/data/ucd/dev/PropertyAliases.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# PropertyAliases-15.1.0.txt
# Date: 2023-01-31, 22:27:07 GMT
# Date: 2023-03-23, 00:36:58 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
Expand Down Expand Up @@ -73,6 +73,7 @@ EqUIdeo ; Equivalent_Unified_Ideograph
FC_NFKC ; FC_NFKC_Closure
lc ; Lowercase_Mapping
NFKC_CF ; NFKC_Casefold
NFKC_SCF ; NFKC_Simple_Casefold
scf ; Simple_Case_Folding ; sfc
slc ; Simple_Lowercase_Mapping
stc ; Simple_Titlecase_Mapping
Expand Down Expand Up @@ -210,6 +211,6 @@ XO_NFKC ; Expands_On_NFKC
XO_NFKD ; Expands_On_NFKD

# ================================================
# Total: 132
# Total: 133

# EOF
5 changes: 4 additions & 1 deletion unicodetools/data/ucd/dev/PropertyValueAliases.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# PropertyValueAliases-15.1.0.txt
# Date: 2023-04-26
# Date: 2023-05-02, 10:31:42 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
Expand Down Expand Up @@ -1173,6 +1173,9 @@ NFKC_QC; M ; Maybe
NFKC_QC; N ; No
NFKC_QC; Y ; Yes

# NFKC_Simple_Casefold (NFKC_SCF)


# NFKD_Quick_Check (NFKD_QC)

NFKD_QC; N ; No
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,10 @@ public void put(
String value,
Merge<String> merger,
boolean hackHangul) {
if (value != null && value.isEmpty() && property != UcdProperty.NFKC_Casefold) {
if (value != null
&& value.isEmpty()
&& property != UcdProperty.NFKC_Casefold
&& property != UcdProperty.NFKC_Simple_Casefold) {
value = null;
}
value = normalizeAndVerify(value);
Expand Down Expand Up @@ -679,7 +682,9 @@ private static void parsePropertyValueFile(
: line.getParts()[2];
// The value should not be an empty string.
// Exception: NFKC_Casefold and NFKC_Simple_Casefold do remove some characters by mapping
// them to nothing.
assert !value.isEmpty() || propInfo.property == UcdProperty.NFKC_Casefold;
assert !value.isEmpty()
|| propInfo.property == UcdProperty.NFKC_Casefold
|| propInfo.property == UcdProperty.NFKC_Simple_Casefold;
if (propInfo.property == UcdProperty.kMandarin) {
if (indexUnicodeProperties.oldVersion) {
value =
Expand All @@ -695,6 +700,7 @@ private static void parsePropertyValueFile(
String defaultValue = null;
switch (propInfo.property) {
case NFKC_Casefold:
case NFKC_Simple_Casefold:
defaultValue = "<code point>";
break;
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public enum UcdProperty {
Idn_Mapping(PropertyType.String, "idnm"),
Lowercase_Mapping(PropertyType.String, "lc"),
NFKC_Casefold(PropertyType.String, "NFKC_CF"),
NFKC_Simple_Casefold(PropertyType.String, "NFKC_SCF"),
Simple_Case_Folding(PropertyType.String, "scf", "sfc"),
Simple_Lowercase_Mapping(PropertyType.String, "slc"),
Simple_Titlecase_Mapping(PropertyType.String, "stc"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1599,6 +1599,7 @@ public static NFKC_Quick_Check_Values forName(String name) {
}
}

// NFKC_Simple_Casefold
public enum NFKD_Quick_Check_Values implements Named {
No("N"),
Yes("Y");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,11 @@ public final String getFirstNameAlias() {

public final String getFirstValueAlias(String value) {
if (valueToFirstValueAlias == null) _getFirstValueAliasCache();
return valueToFirstValueAlias.get(value).toString();
String result = valueToFirstValueAlias.get(value);
if (result == null) {
throw new IllegalArgumentException(value + " is not a value alias for " + name);
}
return result;
}

private void _getFirstValueAliasCache() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ public static void makeCaseFold(boolean normalized) throws java.io.IOException {
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
Utility.dot(ch);

var normativeSCF = new StringBuffer();
var normativeCF = new StringBuffer();
var normativeSCF = new StringBuilder();
var normativeCF = new StringBuilder();

try {
if (!charsUsed.get(ch)) {
Expand Down Expand Up @@ -185,8 +185,8 @@ static void drawLine(
int ch,
String type,
String result,
StringBuffer normativeSCF,
StringBuffer normativeCF) {
StringBuilder normativeSCF,
StringBuilder normativeCF) {
String comment = "";
if (COMMENT_DIFFS) {
final String lower = Default.ucd().getCase(UTF16.valueOf(ch), FULL, LOWER);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,11 @@ public static class PrintStyle {
boolean sortNumeric = false;

String parse(String options) {
options = options.replace('\t', ' ');
final String[] pieces = Utility.split(options, ' ');
for (int i = 1; i < pieces.length; ++i) {
final String piece = pieces[i];
Matcher matcher = Pattern.compile("([^\" \t]|\"[^\"]*\")+").matcher(options);
matcher.find();
String firstPiece = matcher.group();
while (matcher.find()) {
final String piece = matcher.group();
// binary
if (piece.equals("noLabel")) {
noLabel = true;
Expand Down Expand Up @@ -165,10 +166,10 @@ String parse(String options) {
skipUnassigned = afterEquals(piece);
} else if (piece.length() != 0) {
throw new IllegalArgumentException(
"Illegal PrintStyle Parameter: " + piece + " in " + pieces[0]);
"Illegal PrintStyle Parameter: " + piece + " in " + firstPiece);
}
}
return pieces[0];
return firstPiece;
}

private boolean afterEqualsBoolean(String piece) {
Expand Down Expand Up @@ -253,8 +254,26 @@ Map<String, String> getValue2CommentsMap(String property) {
return propertyToValueToComments.get(property);
}

// Returns strings without U+0022 QUOTATION MARK (") unchanged.
// A string enclosed in a pair of " is returned without the enclosing pair.
// Within the contents, "" is the escape sequence for a literal ", thus:
//   meow       ↦ meow
//   "meow"     ↦ meow
//   """meow""" ↦ "meow"
// Throws IllegalArgumentException if the contents have an unpaired ", i.e., a run made up
// of an odd number of consecutive " (for instance me"ow, "me"ow", or a lone ").
static String unquote(String source) {
    String contents = source;
    // The length check ensures that the opening and closing " are two distinct characters;
    // it also lets the empty string through unchanged instead of failing in charAt.
    if (source.length() >= 2
            && source.charAt(0) == '"'
            && source.charAt(source.length() - 1) == '"') {
        contents = source.substring(1, source.length() - 1);
    }
    // An odd-length run of " cannot consist solely of "" escapes.  Matcher.find is needed
    // here: String.matches only succeeds when the whole string matches, which would let a
    // stray " surrounded by other characters go undetected.
    if (java.util.regex.Pattern.compile("(?<!\")(\"\")*\"(?!\")").matcher(contents).find()) {
        throw new IllegalArgumentException(
                "Syntax error: improper quotation marks in " + source);
    }
    return contents.replace("\"\"", "\"");
}

static String afterEquals(String source) {
return source.substring(source.indexOf('=') + 1);
return unquote(source.substring(source.indexOf('=') + 1));
}

static String afterWhitespace(String source) {
Expand Down Expand Up @@ -994,6 +1013,7 @@ public static void generateValueAliasFile(String filename) throws IOException {
if (propName.equals("Bidi_Mirroring_Glyph")
|| propName.equals("Equivalent_Unified_Ideograph")
|| propName.equals("NFKC_Casefold")
|| propName.equals("NFKC_Simple_Casefold")
|| propName.equals("Script_Extensions")) {
// Action item [172-A71]: Don't print @missing lines
// for properties whose specific data files already contain such lines.
Expand Down Expand Up @@ -1156,7 +1176,7 @@ public static void generatePropertyFile(String filename) throws IOException {
if (v == null) {
v = ps.skipUnassigned;
}
if (!v.equals("<codepoint>")) {
if (!v.equals("<code point>")) {
final String v2 = prop.getFirstValueAlias(v);
if (UnicodeProperty.compareNames(v, v2) != 0) {
v = v + " (" + v2 + ")";
Expand Down
Loading

0 comments on commit 46c5231

Please sign in to comment.