Skip to content

Commit

Permalink
Fix remaining issues with multivalued (#618)
Browse files Browse the repository at this point in the history
* Fix remaining issues with multivalued

* Fixed some review issues
  • Loading branch information
macchiati authored Nov 29, 2023
1 parent 8846d0d commit 7631d20
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 18 deletions.
32 changes: 27 additions & 5 deletions UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UnicodeSet;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Comparator;
Expand All @@ -20,6 +21,7 @@
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.unicode.props.UnicodeProperty;

/**
* Class for testing whether strings have allowed combinations of multiple scripts.
Expand Down Expand Up @@ -319,6 +321,7 @@ public static class ScriptExtensions {
public static final Comparator<BitSet> COMPARATOR =
new Comparator<BitSet>() {

@Override
public int compare(BitSet o1, BitSet o2) {
int diff = o1.cardinality() - o2.cardinality();
if (diff != 0) return diff;
Expand All @@ -344,6 +347,7 @@ private static class MyHandler extends FileUtilities.SemiFileReader {

UnicodeMap<BitSet> map = new UnicodeMap<BitSet>();

@Override
public boolean handleLine(int start, int end, String[] items) {
BitSet bitSet = new BitSet(LIMIT);
for (String script : SPACES.split(items[1])) {
Expand Down Expand Up @@ -429,21 +433,39 @@ public static UnicodeMap<String> getScriptSpecialsNames() {
return result;
}

/**
 * Builds the rows of alternate value names: first one row per value returned by
 * {@code getScriptSpecials().getAvailableValues()}, then one row per available value of
 * {@code scriptProp}.
 *
 * <p>NOTE(review): this span looks like a rendered diff without +/- markers, so the old and
 * new forms of several lines (both signatures, the array-based and the list-based
 * {@code result}, both {@code return} statements) appear side by side. Confirm against the
 * actual file before relying on it.
 *
 * @param scriptProp property whose available values and value aliases supply the
 *     single-script rows
 * @return one {@code String[]} per row; every row built here has at least two entries
 */
public static String[][] getScriptSpecialsAlternates() {
public static String[][] getScriptSpecialsAlternates(UnicodeProperty scriptProp) {
Collection<BitSet> availableValues = getScriptSpecials().getAvailableValues();
String[][] result = new String[availableValues.size()][];
List<String[]> result = new ArrayList<>();
Set<String> names = new TreeSet<String>(); // to alphabetize

int i = 0;
// One two-element row per script-specials value: the name string built with LONG names,
// then the one built with SHORT names. Presumably both are ","-joined script lists;
// getNames is not visible here, so verify.
for (BitSet value : availableValues) {
String baseName =
ScriptExtensions.getNames(value, UProperty.NameChoice.LONG, ",", names);
String altName =
ScriptExtensions.getNames(value, UProperty.NameChoice.SHORT, ",", names);
String[] row = {baseName, altName};
result[i++] = row;
result.add(row);
}
return result;

// Get the single values, and build alternate values for the property, for isValidValue
// of a single script (e.g. Arab)
List<String> values = scriptProp.getAvailableValues();
for (String value : values) {
// Row layout: the value itself first, followed by every alias that differs from it.
List<String> row = new ArrayList<>();
row.add(value);
for (String alias : scriptProp.getValueAliases(value)) {
if (!alias.equals(value)) {
row.add(alias);
}
}
// duplicate it whenever singular, because the tooling expects at least 2 values (ugh)
if (row.size() == 1) {
row.add(value);
}
result.add(row.toArray(new String[row.size()]));
}

return result.toArray(new String[result.size()][]);
}

private ScriptTester(UnicodeMap<BitSet> character_scripts) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ public MySymbolTable() {
// return null;
// }

@Override
public boolean applyPropertyAlias(
String propertyName, String propertyValue, UnicodeSet result) {
boolean status = false;
Expand Down Expand Up @@ -201,9 +202,11 @@ public boolean applyPropertyAlias(
}
;
if (!status) {
try {
status = applyPropertyAlias0(prop, "No", result, !invert);
} catch (Exception e) {
if (prop.isType(UnicodeProperty.BINARY_OR_ENUMERATED_OR_CATALOG_MASK)) {
try {
status = applyPropertyAlias0(prop, "No", result, !invert);
} catch (Exception e) {
}
}
;
if (!status) {
Expand Down Expand Up @@ -336,6 +339,7 @@ public ComparisonMatcher(String pattern, Relation comparator) {
this.pattern = pattern;
}

@Override
public boolean test(String value) {
int comp = comparator.compare(pattern, value.toString());
switch (relation) {
Expand All @@ -352,6 +356,7 @@ public boolean test(String value) {
}
}

@Override
public PatternMatcher set(String pattern) {
this.pattern = pattern;
return this;
Expand Down
14 changes: 7 additions & 7 deletions UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -269,15 +269,18 @@ public String transform(Integer source) {

// set up the special script property
UnicodeProperty scriptProp = base.getProperty("sc");

// Compose the function and add
UnicodeMap<String> specialMap = new UnicodeMap<String>();
specialMap.putAll(scriptProp.getUnicodeMap());
specialMap.putAll(
scriptProp.getUnicodeMap()); // if there is no value, use the script property
specialMap.putAll(ScriptTester.getScriptSpecialsNames());
add(
new UnicodeProperty.UnicodeMapProperty()
.set(specialMap)
.setMain("Script_Extensions", "scx", UnicodeProperty.ENUMERATED, "1.1")
.addValueAliases(
ScriptTester.getScriptSpecialsAlternates(),
ScriptTester.getScriptSpecialsAlternates(scriptProp),
AliasAddAction.IGNORE_IF_MISSING)
.setMultivalued(true));

Expand Down Expand Up @@ -359,6 +362,7 @@ private void addExamplarProperty(

// convert to UnicodeMap
UnicodeMap<String> unicodeMap = new UnicodeMap<>();
unicodeMap.putAll(0, 0x10FFFF, ""); // default is empty string
for (Entry<Integer, Collection<String>> entry : data.asMap().entrySet()) {
String value = JOIN_COMMAS.join(entry.getValue()).intern();
unicodeMap.put(entry.getKey(), value);
Expand All @@ -383,11 +387,7 @@ private void addExamplarProperty(
add(
new UnicodeProperty.UnicodeMapProperty()
.set(unicodeMap)
.setMain(
propertyName,
propertyAbbreviation,
UnicodeProperty.ENUMERATED,
"1.1")
.setMain(propertyName, propertyAbbreviation, UnicodeProperty.STRING, "1.1")
.addValueAliases(locales, AliasAddAction.ADD_MAIN_ALIAS)
.setMultivalued(true));
}
Expand Down
57 changes: 57 additions & 0 deletions UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,23 @@
import com.ibm.icu.text.UnicodeSet;
import org.junit.jupiter.api.Test;
import org.unicode.jsp.UnicodeSetUtilities;
import org.unicode.jsp.XPropertyFactory;
import org.unicode.props.UnicodeProperty;
import org.unicode.unittest.TestFmwkMinusMinus;

public class TestMultivalued extends TestFmwkMinusMinus {

private static final boolean DEBUG = false;

UnicodeProperty exemplarProp = XPropertyFactory.make().getProperty("exemplar");
UnicodeProperty scxProp = XPropertyFactory.make().getProperty("scx");

@Test
public void TestScx1Script() {
if (DEBUG) {
String x = scxProp.getValue('।');
}

String unicodeSetString = "\\p{scx=deva}";
UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);

Expand All @@ -20,6 +32,20 @@ public void TestScx1Script() {
parsed.containsAll(mustNotContain));
}

/**
 * Parses {@code \p{scx=Arab}} and checks that the resulting set includes the expected Arabic
 * characters while excluding a Bangla one.
 */
@Test
public void TestScx1ScriptB() {
    final String pattern = "\\p{scx=Arab}";
    final UnicodeSet arabSet = UnicodeSetUtilities.parseUnicodeSet(pattern);

    // Two probes that must both be present: one character with a single script, one with
    // multiple scripts.
    final UnicodeSet required = new UnicodeSet("[،ء]");
    final String requiredMessage = pattern + " contains " + required;
    assertTrue(requiredMessage, arabSet.containsAll(required));

    // A single Bangla character, which must be absent.
    final UnicodeSet forbidden = new UnicodeSet("[ক]");
    final String forbiddenMessage = pattern + " !contains " + forbidden;
    assertFalse(forbiddenMessage, arabSet.containsAll(forbidden));
}

@Test
public void TestScxMulti() {
String unicodeSetString = "\\p{scx=beng,deva}";
Expand All @@ -37,6 +63,10 @@ public void TestScxMulti() {

@Test
public void TestExemplars() {
if (DEBUG) {
String x = exemplarProp.getValue('æ');
}

String unicodeSetString = "\\p{exem=da}";
UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);

Expand All @@ -48,4 +78,31 @@ public void TestExemplars() {
unicodeSetString + " !contains " + mustNotContain,
parsed.containsAll(mustNotContain));
}

/**
 * Exercises the empty-string value of the exemplar property.
 *
 * <p>Expectations encoded below: U+0000 has the value {@code ""} and Greek alpha has
 * {@code "el"}; {@code \p{exem}} with no value selects U+0000 but not alpha, while
 * {@code \p{exem=el}} does the opposite; and the first element of the Greek characters
 * intersected with {@code \p{exem}} / {@code \P{exem}} has the value {@code ""} /
 * {@code "el"} respectively.
 */
@Test
public void TestEmpty() {
    final int nul = 0x0000;
    final char alpha = 'α';

    // Direct property lookups.
    assertEquals("exemplar(0x0000)", "", exemplarProp.getValue(nul));
    assertEquals("exemplar(α)", "el", exemplarProp.getValue(alpha));

    // Value-less query versus an explicit locale value.
    final UnicodeSet noValue = UnicodeSetUtilities.parseUnicodeSet("\\p{exem}");
    assertTrue("\\p{exem} contains 0", noValue.contains(nul));
    assertFalse("\\p{exem} contains α", noValue.contains(alpha));
    final UnicodeSet greekLocale = UnicodeSetUtilities.parseUnicodeSet("\\p{exem=el}");
    assertFalse("\\p{exem=el} contains 0", greekLocale.contains(nul));
    assertTrue("\\p{exem=el} contains α", greekLocale.contains(alpha));

    // First Greek-script element whose exemplar value matches the value-less query.
    final String positivePattern = "[\\p{Greek}&\\p{exem}]";
    final String positiveFirst =
            UnicodeSetUtilities.parseUnicodeSet(positivePattern).iterator().next();
    assertEquals(positivePattern, "", exemplarProp.getValue(positiveFirst.codePointAt(0)));

    // Same intersection with the negated query.
    final String negatedPattern = "[\\p{Greek}&\\P{exem}]";
    final String negatedFirst =
            UnicodeSetUtilities.parseUnicodeSet(negatedPattern).iterator().next();
    assertEquals(negatedPattern, "el", exemplarProp.getValue(negatedFirst.codePointAt(0)));
}
}
14 changes: 11 additions & 3 deletions unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,14 @@ public UnicodeProperty setMultivalued(boolean value) {
(1 << ENUMERATED)
| (1 << EXTENDED_ENUMERATED)
| (1 << CATALOG)
| (1 << EXTENDED_CATALOG);
| (1 << EXTENDED_CATALOG),
BINARY_OR_ENUMERATED_OR_CATALOG_MASK =
(1 << ENUMERATED)
| (1 << EXTENDED_ENUMERATED)
| (1 << CATALOG)
| (1 << EXTENDED_CATALOG)
| (1 << BINARY)
| (1 << EXTENDED_BINARY);

private static final String[] TYPE_NAMES = {
"Unknown",
Expand Down Expand Up @@ -405,7 +412,7 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) {
public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) {
if (result == null) result = new UnicodeSet();
boolean uniformUnassigned = hasUniformUnassigned();
if (isType(STRING_OR_MISC_MASK)) {
if (isType(STRING_OR_MISC_MASK) && !isMultivalued) {
for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned);
usi.next(); ) { // int i = 0; i <= 0x10FFFF; ++i
int i = usi.codepoint;
Expand All @@ -423,7 +430,8 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) {
while (it.hasNext()) {
String value = it.next();
temp.clear();
Iterator<String> it2 = getValueAliases(value, temp).iterator();
final List<String> valueAliases = getValueAliases(value, temp);
Iterator<String> it2 = valueAliases.iterator();
while (it2.hasNext()) {
String value2 = it2.next();
// System.out.println("Values:" + value2);
Expand Down

0 comments on commit 7631d20

Please sign in to comment.