Skip to content

Commit

Permalink
Miscellaneous fixes to IUP (#681)
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin authored Feb 6, 2024
1 parent 62bb9f6 commit 04fb3b8
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Transform;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.VersionInfo;
Expand Down Expand Up @@ -535,28 +534,12 @@ private UnicodeMap<String> getCachedMap(UcdProperty prop2, String sourceFileName

public static String getResolvedValue(
IndexUnicodeProperties props, UcdProperty prop, String codepoint, String value) {
if (value == null && props != null) {
if (getResolvedDefaultValueType(prop) == DefaultValueType.CODE_POINT) {
return codepoint;
}
}
if (prop == UcdProperty.Name && value != null && value.endsWith("-#")) {
return value.substring(0, value.length() - 1) + Utility.hex(codepoint);
}
return value;
return props.getProperty(prop).getValue(codepoint.codePointAt(0));
}

public static String getResolvedValue(
IndexUnicodeProperties props, UcdProperty prop, int codepoint, String value) {
if (value == null && props != null) {
if (getResolvedDefaultValueType(prop) == DefaultValueType.CODE_POINT) {
return UTF16.valueOf(codepoint);
}
}
if (prop == UcdProperty.Name && value != null && value.endsWith("-#")) {
return value.substring(0, value.length() - 1) + Utility.hex(codepoint);
}
return value;
return props.getProperty(prop).getValue(codepoint);
}

UnicodeMap<String> nameMap;
Expand Down Expand Up @@ -693,17 +676,66 @@ class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
}
}

@Override
public boolean isTrivial() {
return _getRawUnicodeMap().isEmpty()
|| _getRawUnicodeMap().keySet("").equals(UnicodeSet.ALL_CODE_POINTS);
}

@Override
protected UnicodeMap<String> _getUnicodeMap() {
var raw = _getRawUnicodeMap();
if (prop == UcdProperty.Name
|| raw.containsValue("<code point>")
|| raw.containsValue("<codepoint>")) {
final long start = System.currentTimeMillis();
UnicodeMap<String> newMap = new UnicodeMap<>();
for (UnicodeMap.EntryRange<String> range : raw.entryRanges()) {
if (range.codepoint == -1) {
newMap.put(range.string, range.value);
} else if (DefaultValueType.forString(range.value)
== DefaultValueType.CODE_POINT
|| (prop == UcdProperty.Name && range.value.endsWith("#"))) {
for (int c = range.codepoint; c <= range.codepointEnd; ++c) {
newMap.put(c, resolveValue(range.value, c));
}
} else {
newMap.putAll(range.codepoint, range.codepointEnd, range.value);
}
}
final long stop = System.currentTimeMillis();
final long Δt_in_ms = stop - start;
// We do not want to construct these UnicodeMaps that map most of the code space to
// itself, not so much because building them is costly, but because whatever we do
// on them is almost certainly a bad idea (for instance calling `values()` will be
// extremely slow). Log a trace so we can figure out where we are using this.
System.out.println(
"Built " + prop + " " + ucdVersion + " map in " + Δt_in_ms + " ms");
new Throwable().printStackTrace(System.out);

return newMap;
} else {
return raw;
}
}

protected UnicodeMap<String> _getRawUnicodeMap() {
return load(prop);
}

@Override
protected String _getValue(int codepoint) {
final String result = _getUnicodeMap().get(codepoint);
if (DefaultValueType.forString(result) == DefaultValueType.CODE_POINT) {
final String result = _getRawUnicodeMap().get(codepoint);
return resolveValue(result, codepoint);
}

private String resolveValue(String rawValue, int codepoint) {
if (DefaultValueType.forString(rawValue) == DefaultValueType.CODE_POINT) {
return Character.toString(codepoint);
} else if (prop == UcdProperty.Name && rawValue != null && rawValue.endsWith("#")) {
return rawValue.substring(0, rawValue.length() - 1) + Utility.hex(codepoint);
} else {
return result;
return rawValue;
}
}

Expand Down Expand Up @@ -731,6 +763,7 @@ protected List _getValueAliases(String valueAlias, List result) {
}
}
if (!result.contains(valueAlias)) {
// TODO(egg): We should not be constructing this map for this.
if (_getUnicodeMap().containsValue(valueAlias)) {
result.add(valueAlias);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) {
UnicodeProperty prop = factory.getProperty(propName);
switch (propName) {
case "Bidi_Mirroring_Glyph":
// The default is <none> in BidiMirroring.txt, but TUP incorrectly has it as
// <code point>.
prop =
replaceCpValues(
prop,
Expand All @@ -68,22 +70,21 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) {
prop = replaceValues(prop, oldValue -> oldValue == null ? "\u0000" : oldValue);
break;
case "FC_NFKC_Closure":
// The default is <code point> in PropertyValueAliases.txt, but TUP incorrectly
// has it as <none>.
prop =
replaceCpValues(
prop, (cp, oldValue) -> fixFC_NFKC_Closure(cp, oldValue));

break;
case "Joining_Type":
prop = replaceCpValues(prop, (cp, oldValue) -> fixJoining_Type(cp, oldValue));
break;
case "Joining_Group":
prop = modifyJoining_Group(prop);
break;
case "Jamo_Short_Name":
prop = modifyJamo_Short_Name(prop);
break;
case "Name":
// prop = modifyName(prop);
// TUP reports the special label <control-XXXX> as the value of the Name
// property. This is incorrect, the actual value of the Name property is "" for
// those, see https://www.unicode.org/versions/latest/ch04.pdf#G135245 and
// https://www.unicode.org/reports/tr44/#Name.
prop = replaceCpValues(prop, (cp, x) -> fixName(cp, x));
break;
case "Numeric_Value":
Expand Down Expand Up @@ -301,40 +302,24 @@ private String fixNumericValue(String value) {
private String fixName(int cp, String value) {
if (control.contains(cp)) {
return "<control-" + Utility.hex(cp, 4) + ">";
} else if (value != null && value.contains("#")) {
return value.replace("#", Utility.hex(cp, 4));
} else {
return value;
}
}

private String fixFC_NFKC_Closure(int cp, String oldValue) {
if (oldValue.equals("<code point>") || equalsString(cp, oldValue)) {
if (equalsString(cp, oldValue)) {
return null;
} else {
return oldValue;
}
}

// Joining_Type needs fix in IUP
private String fixJoining_Type(int cp, String oldValue) {
if (defaultTransparent.contains(cp) && "Non_Joining".equals(oldValue)) {
return "Transparent";
} else {
return oldValue;
}
}

// Jamo_Short_Name needs fix in IUP
private UnicodeProperty modifyJamo_Short_Name(UnicodeProperty prop) {
return copyPropReplacingMap(prop, prop.getUnicodeMap().put('ᄋ', ""));
}

// Joining_Group needs fix in IUP (really, in UCD data)
private UnicodeProperty modifyJoining_Group(UnicodeProperty prop) {
return copyPropReplacingMap(prop, prop.getUnicodeMap().put('ۃ', "Teh_Marbuta_Goal"));
}

/** Very useful. May already be in ICU, but not sure. */
public boolean equalsString(int codepoint, String value) {
return codepoint == value.codePointAt(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,12 @@ public UnicodeMap<String> getUnicodeMap() {
return getUnicodeMap(false);
}

public boolean isTrivial() {
final var map = getUnicodeMap();
return map.isEmpty()
|| map.keySet("").equals(UnicodeSet.ALL_CODE_POINTS) && map.stringKeys().isEmpty();
}

/**
* @return the unicode map
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.unicode.text.UCD;

import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UnicodeSet;
import java.text.ParsePosition;
Expand Down Expand Up @@ -63,13 +62,6 @@ public static VersionedProperty forJSPs() {
private static final Set<String> TOOL_ONLY_PROPERTIES =
Set.of("toNFC", "toNFD", "toNFKC", "toNFKD");

private static boolean isTrivial(UnicodeMap<String> map) {
return map.isEmpty()
|| (map.values().size() == 1
&& map.getSet(map.values().iterator().next())
.equals(UnicodeSet.ALL_CODE_POINTS));
}

public UnicodeProperty getProperty() {
return property;
}
Expand Down Expand Up @@ -112,11 +104,11 @@ public VersionedProperty set(String xPropertyName) {
propSource = getIndexedProperties(version);
property = propSource.getProperty(xPropertyName);
if ((property == null && TOOL_ONLY_PROPERTIES.contains(xPropertyName))
|| (property != null && isTrivial(property.getUnicodeMap()) && allowRetroactive)) {
|| (property != null && property.isTrivial() && allowRetroactive)) {
propSource = ToolUnicodePropertySource.make(version);
property = propSource.getProperty(xPropertyName);
}
if (property == null || isTrivial(property.getUnicodeMap())) {
if (property == null || property.isTrivial()) {
if (!throwOnUnknownProperty) {
return null;
}
Expand Down

0 comments on commit 04fb3b8

Please sign in to comment.