Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix JSP failures with scx #615

Merged
merged 6 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 113 additions & 1 deletion UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package org.unicode.jsp;

import com.google.common.base.Joiner;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.dev.util.UnicodeMap.EntryRange;
macchiati marked this conversation as resolved.
Show resolved Hide resolved
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty.NameChoice;
Expand All @@ -12,13 +16,19 @@
import com.ibm.icu.text.Transform;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.LocaleData;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map.Entry;
markusicu marked this conversation as resolved.
Show resolved Hide resolved
import java.util.Set;
import java.util.TreeSet;
import org.unicode.idna.Idna.IdnaType;
import org.unicode.idna.Idna2003;
import org.unicode.idna.Idna2008;
Expand All @@ -28,9 +38,13 @@
import org.unicode.props.UnicodeProperty.BaseProperty;
import org.unicode.props.UnicodeProperty.Factory;
import org.unicode.props.UnicodeProperty.SimpleProperty;
import org.unicode.text.utility.Utility;

public class XPropertyFactory extends UnicodeProperty.Factory {

/** Joiner that concatenates values with a single comma and no surrounding whitespace. */
private static final Joiner JOIN_COMMAS = Joiner.on(",");
/** Debug switch; when true, addExamplarProperty prints the code point to value mapping it built. */
private static final boolean DEBUG_MULTI = false;

/**
 * Frozen set built from the pattern below: everything outside General_Category C, plus the
 * Cc and Cf categories and the noncharacter code points.
 */
static final UnicodeSet ALL =
new UnicodeSet("[[:^C:][:Cc:][:Cf:][:noncharactercodepoint:]]").freeze();

Expand Down Expand Up @@ -96,6 +110,7 @@ public final Factory add2(UnicodeProperty sp) {
add(
new CodepointTransformProperty(
new Transform<Integer, String>() {
@Override
public String transform(Integer source) {
return Normalizer.normalize(source, Normalizer.NFC);
}
Expand All @@ -105,6 +120,7 @@ public String transform(Integer source) {
add(
new CodepointTransformProperty(
new Transform<Integer, String>() {
@Override
public String transform(Integer source) {
return Normalizer.normalize(source, Normalizer.NFD);
}
Expand All @@ -114,6 +130,7 @@ public String transform(Integer source) {
add(
new CodepointTransformProperty(
new Transform<Integer, String>() {
@Override
public String transform(Integer source) {
return Normalizer.normalize(source, Normalizer.NFKC);
}
Expand All @@ -123,6 +140,7 @@ public String transform(Integer source) {
add(
new CodepointTransformProperty(
new Transform<Integer, String>() {
@Override
public String transform(Integer source) {
return Normalizer.normalize(source, Normalizer.NFKD);
}
Expand All @@ -133,6 +151,7 @@ public String transform(Integer source) {
add(
new StringTransformProperty(
new StringTransform() {
@Override
public String transform(String source) {
return UCharacter.foldCase(source, true);
}
Expand All @@ -142,6 +161,7 @@ public String transform(String source) {
add(
new StringTransformProperty(
new StringTransform() {
@Override
public String transform(String source) {
return UCharacter.toLowerCase(ULocale.ROOT, source);
}
Expand All @@ -151,6 +171,7 @@ public String transform(String source) {
add(
new StringTransformProperty(
new StringTransform() {
@Override
public String transform(String source) {
return UCharacter.toUpperCase(ULocale.ROOT, source);
}
Expand All @@ -160,6 +181,7 @@ public String transform(String source) {
add(
new StringTransformProperty(
new StringTransform() {
@Override
public String transform(String source) {
return UCharacter.toTitleCase(ULocale.ROOT, source, null);
}
Expand All @@ -170,6 +192,7 @@ public String transform(String source) {
add(
new StringTransformProperty(
new StringTransform() {
@Override
public String transform(String source) {
StringBuilder b = new StringBuilder();
for (int cp : CharSequences.codePoints(source)) {
Expand All @@ -184,6 +207,7 @@ public String transform(String source) {
add(
new StringTransformProperty(
new StringTransform() {
@Override
public String transform(String source) {
String result = NFM.nfm.get(source);
return result == null ? source : result;
Expand All @@ -201,6 +225,7 @@ public String transform(String source) {
add(
new CodepointTransformProperty(
new Transform<Integer, String>() {
@Override
public String transform(Integer source) {
return UnicodeUtilities.getSubheader().getSubheader(source);
}
Expand Down Expand Up @@ -239,6 +264,9 @@ public String transform(Integer source) {
.setMain("bmp", "bmp", UnicodeProperty.BINARY, "6.0"));

addCollationProperty();
addExamplarProperty(LocaleData.ES_STANDARD, "exem", "exemplar");
addExamplarProperty(LocaleData.ES_AUXILIARY, "exema", "exemplar_aux");
addExamplarProperty(LocaleData.ES_PUNCTUATION, "exemp", "exemplar_punct");

// set up the special script property
UnicodeProperty scriptProp = base.getProperty("sc");
Expand All @@ -251,7 +279,8 @@ public String transform(Integer source) {
.setMain("Script_Extensions", "scx", UnicodeProperty.ENUMERATED, "1.1")
.addValueAliases(
ScriptTester.getScriptSpecialsAlternates(),
AliasAddAction.IGNORE_IF_MISSING));
AliasAddAction.IGNORE_IF_MISSING)
.setMultivalued(true));

CachedProps cp = CachedProps.CACHED_PROPS;
for (String prop : cp.getAvailable()) {
Expand Down Expand Up @@ -289,6 +318,81 @@ public String transform(Integer source) {
.setMain("RGI_Emoji", "RGI_Emoji", UnicodeProperty.BINARY, "13.0"));
}

private void addExamplarProperty(
int exemplarType, String propertyAbbreviation, String propertyName) {
Multimap<Integer, String> data = TreeMultimap.create();
Set<String> localeSet = new TreeSet<>();

for (ULocale ulocale : ULocale.getAvailableLocales()) {
if (!ulocale.getCountry().isEmpty()) {
continue;
// we want to skip cases where characters are in the parent locale, but there is no
// ULocale parentLocale = ulocale.getParent();
}
UnicodeSet exemplarSet = LocaleData.getExemplarSet(ulocale, 0, exemplarType);
if (!ulocale.getScript().isEmpty()) {
// we can't find out the parent locale or defaultContent locale in ICU, so we hack
// it
String langLocale = ulocale.getLanguage();
UnicodeSet langExemplarSet =
LocaleData.getExemplarSet(new ULocale(langLocale), 0, exemplarType);
if (langExemplarSet.equals(exemplarSet)) {
continue;
}
}
String locale = ulocale.toString();
markusicu marked this conversation as resolved.
Show resolved Hide resolved
localeSet.add(locale);
for (UnicodeSetIterator it = new UnicodeSetIterator(exemplarSet); it.nextRange(); ) {
if (it.codepoint == UnicodeSetIterator.IS_STRING) {
// flatten
int cp = 0;
for (int i = 0; i < it.string.length(); i += Character.charCount(cp)) {
cp = it.string.codePointAt(i);
data.put(cp, locale);
}
} else {
for (int cp = it.codepoint; cp <= it.codepointEnd; ++cp) {
markusicu marked this conversation as resolved.
Show resolved Hide resolved
data.put(cp, locale);
}
}
}
}

// convert to UnicodeMap
UnicodeMap<String> unicodeMap = new UnicodeMap<>();
for (Entry<Integer, Collection<String>> entry : data.asMap().entrySet()) {
String value = JOIN_COMMAS.join(entry.getValue()).intern();
unicodeMap.put(entry.getKey(), value);
}
if (DEBUG_MULTI) {
System.out.println("\n" + propertyName);
for (EntryRange<String> entry : unicodeMap.entryRanges()) {
System.out.println(
Utility.hex(entry.codepoint)
+ (entry.codepoint == entry.codepointEnd
? ""
: "-" + Utility.hex(entry.codepointEnd))
+ " ;\t"
+ entry.value);
}
}

// put locales into right format
String[] localeList = localeSet.toArray(new String[localeSet.size()]);
String[][] locales = new String[][] {localeList, localeList}; // abbreviations are the same

add(
new UnicodeProperty.UnicodeMapProperty()
.set(unicodeMap)
.setMain(
propertyName,
propertyAbbreviation,
UnicodeProperty.ENUMERATED,
"1.1")
.addValueAliases(locales, AliasAddAction.ADD_MAIN_ALIAS)
.setMultivalued(true));
}

private void addCollationProperty() {
RuleBasedCollator c = UnicodeSetUtilities.RAW_COLLATOR;
// (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
Expand Down Expand Up @@ -652,6 +756,7 @@ public StringTransformProperty(
setUniformUnassigned(hasUniformUnassigned);
}

/** Applies the string transform to the string form of {@code codepoint}. */
@Override
protected String _getValue(int codepoint) {
    final String codePointAsString = UTF16.valueOf(codepoint);
    return transform.transform(codePointAsString);
}
Expand All @@ -666,6 +771,7 @@ public CodepointTransformProperty(
setUniformUnassigned(hasUniformUnassigned);
}

/** Applies the code point transform directly to {@code codepoint}. */
@Override
protected String _getValue(int codepoint) {
    final String transformed = transform.transform(codepoint);
    return transformed;
}
Expand All @@ -682,6 +788,7 @@ public static class EncodingProperty extends SimpleProperty {
encoder = new CharEncoder(charset, false, false);
}

@Override
protected String _getValue(int codepoint) {
int len = encoder.getValue(codepoint, temp, 0);
if (len < 0) {
Expand All @@ -697,6 +804,7 @@ protected String _getValue(int codepoint) {
return result.toString();
}

@Override
public boolean isDefault(int codepoint) {
int len = encoder.getValue(codepoint, temp, 0);
return len < 0;
Expand All @@ -716,6 +824,7 @@ public static class EncodingPropertyBoolean extends SimpleProperty {
encoder = new CharEncoder(charset, true, true);
}

/** Returns "Yes" when the encoder reports a positive length for {@code codepoint}, else "No". */
@Override
protected String _getValue(int codepoint) {
    final int encodedLength = encoder.getValue(codepoint, null, 0);
    if (encodedLength > 0) {
        return "Yes";
    }
    return "No";
}
Expand All @@ -731,6 +840,7 @@ public XPropertyFactory.UnicodeSetProperty set(UnicodeSet set) {
return this;
}

@Override
protected UnicodeMap<String> _getUnicodeMap() {
UnicodeMap<String> result = new UnicodeMap<String>();
result.putAll(unicodeSet, "Yes");
Expand All @@ -743,10 +853,12 @@ public XPropertyFactory.UnicodeSetProperty set(String string) {
return set(new UnicodeSet(string).freeze());
}

/** Returns the first YESNO_ARRAY entry when the set contains {@code codepoint}, else the second. */
@Override
protected String _getValue(int codepoint) {
    if (unicodeSet.contains(codepoint)) {
        return YESNO_ARRAY[0];
    }
    return YESNO_ARRAY[1];
}

/**
 * Returns the shared YESNO list; the {@code result} argument is ignored rather than filled.
 */
@Override
protected List _getAvailableValues(List result) {
return YESNO;
}
Expand Down
53 changes: 53 additions & 0 deletions UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.unicode.jsptest;

import com.ibm.icu.text.UnicodeSet;
import org.junit.jupiter.api.Test;
import org.unicode.jsp.UnicodeSetUtilities;
import org.unicode.unittest.TestFmwkMinusMinus;

public class TestMultivalued extends TestFmwkMinusMinus {
@Test
public void TestScx1Script() {
// As of 2023-11-24, scx was not working properly
markusicu marked this conversation as resolved.
Show resolved Hide resolved
String unicodeSetString = "\\p{scx=deva}";
UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);

UnicodeSet mustContain = new UnicodeSet("[ᳵ।]"); // one character B&D, other B&D&D&G&...
assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain));

UnicodeSet mustNotContain = new UnicodeSet("[ক]"); // one Bengali character
markusicu marked this conversation as resolved.
Show resolved Hide resolved
assertFalse(
unicodeSetString + " !contains " + mustNotContain,
parsed.containsAll(mustNotContain));
}

@Test
public void TestScxMulti() {
// As of 2023-11-24, scx was not working properly
markusicu marked this conversation as resolved.
Show resolved Hide resolved
String unicodeSetString = "\\p{scx=beng,deva}";
String exceptionMessage = null;
try {
UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);
} catch (Exception e) {
exceptionMessage = e.getMessage();
}
assertEquals(
"Expected exception",
"Multivalued property values can't contain commas.",
exceptionMessage);
}

@Test
public void TestExemplars() {
String unicodeSetString = "\\p{exem=da}";
UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);

UnicodeSet mustContain = new UnicodeSet("[æ]");
assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain));

UnicodeSet mustNotContain = new UnicodeSet("[ç]");
assertFalse(
unicodeSetString + " !contains " + mustNotContain,
parsed.containsAll(mustNotContain));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -413,14 +413,6 @@ public void TestPerMill(final String name, final Charset charset) {
}
}

/**
 * Checks that a comma-separated scx value list parses to a non-empty set. The corresponding
 * check for {@code [:scs=Hant:]} is disabled.
 */
@Test
public void TestScriptSpecials() {
    // UnicodeSet set = UnicodeSetUtilities.parseUnicodeSet("[:scs=Hant:]");
    // assertNotEquals("Hant", 0, set.size());
    final String pattern = "[:scx=Arab,Syrc:]";
    final int parsedSize = UnicodeSetUtilities.parseUnicodeSet(pattern).size();
    assertNotEquals("Arab Syrc", 0, parsedSize);
}

@Test
public void TestGC() {
Map<String, R2<String, UnicodeSet>> SPECIAL_GC =
Expand Down
Loading
Loading