From 7631d20d618d20e43713060d9916cfd6590a8f89 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Wed, 29 Nov 2023 14:22:41 -0800 Subject: [PATCH] Fix remaining issues with multivalued (#618) * Fix remaining issues with multivalued * Fixed some review issues --- .../java/org/unicode/jsp/ScriptTester.java | 32 +++++++++-- .../org/unicode/jsp/UnicodeSetUtilities.java | 11 +++- .../org/unicode/jsp/XPropertyFactory.java | 14 ++--- .../org/unicode/jsptest/TestMultivalued.java | 57 +++++++++++++++++++ .../org/unicode/props/UnicodeProperty.java | 14 ++++- 5 files changed, 110 insertions(+), 18 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester.java b/UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester.java index 8da9c246b..f5dd6d9e6 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester.java @@ -7,6 +7,7 @@ import com.ibm.icu.lang.UScript; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.UnicodeSet; +import java.util.ArrayList; import java.util.BitSet; import java.util.Collection; import java.util.Comparator; @@ -20,6 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; import java.util.regex.Pattern; +import org.unicode.props.UnicodeProperty; /** * Class for testing whether strings have allowed combinations of multiple scripts. @@ -319,6 +321,7 @@ public static class ScriptExtensions { public static final Comparator COMPARATOR = new Comparator() { + @Override public int compare(BitSet o1, BitSet o2) { int diff = o1.cardinality() - o2.cardinality(); if (diff != 0) return diff; @@ -344,6 +347,7 @@ private static class MyHandler extends FileUtilities.SemiFileReader { UnicodeMap map = new UnicodeMap(); + @Override public boolean handleLine(int start, int end, String[] items) { BitSet bitSet = new BitSet(LIMIT); for (String script : SPACES.split(items[1])) { @@ -429,21 +433,39 @@ public static UnicodeMap getScriptSpecialsNames() { return result; } - public static String[][] getScriptSpecialsAlternates() { + public static String[][] getScriptSpecialsAlternates(UnicodeProperty scriptProp) { Collection availableValues = getScriptSpecials().getAvailableValues(); - String[][] result = new String[availableValues.size()][]; + List result = new ArrayList<>(); Set names = new TreeSet(); // to alphabetize - int i = 0; for (BitSet value : availableValues) { String baseName = ScriptExtensions.getNames(value, UProperty.NameChoice.LONG, ",", names); String altName = ScriptExtensions.getNames(value, UProperty.NameChoice.SHORT, ",", names); String[] row = {baseName, altName}; - result[i++] = row; + result.add(row); } - return result; + + // Get the single values, and build alternate values for the property, for isValidValue + // of a single script (eg Arab) + List values = scriptProp.getAvailableValues(); + for (String value : values) { + List row = new ArrayList<>(); + row.add(value); + for (String alias : scriptProp.getValueAliases(value)) { + if (!alias.equals(value)) { + row.add(alias); + } + } + // duplicate it whenever singular, because the tooling expects at least 2 values (ugg) + if (row.size() == 1) { + row.add(value); + } + result.add(row.toArray(new String[row.size()])); + } + + return result.toArray(new String[result.size()][]); } private ScriptTester(UnicodeMap character_scripts) { diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 9f7926b10..047ca3790 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -150,6 +150,7 @@ public MySymbolTable() { // return null; // } + @Override public boolean applyPropertyAlias( String propertyName, String propertyValue, UnicodeSet result) { boolean status = false; @@ -201,9 +202,11 @@ public boolean applyPropertyAlias( } ; if (!status) { - try { - status = applyPropertyAlias0(prop, "No", result, !invert); - } catch (Exception e) { + if (prop.isType(UnicodeProperty.BINARY_OR_ENUMERATED_OR_CATALOG_MASK)) { + try { + status = applyPropertyAlias0(prop, "No", result, !invert); + } catch (Exception e) { + } } ; if (!status) { @@ -336,6 +339,7 @@ public ComparisonMatcher(String pattern, Relation comparator) { this.pattern = pattern; } + @Override public boolean test(String value) { int comp = comparator.compare(pattern, value.toString()); switch (relation) { @@ -352,6 +356,7 @@ public boolean test(String value) { } } + @Override public PatternMatcher set(String pattern) { this.pattern = pattern; return this; diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java b/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java index b7232c295..74fde352c 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java @@ -269,15 +269,18 @@ public String transform(Integer source) { // set up the special script property UnicodeProperty scriptProp = base.getProperty("sc"); + + // Compose the function and add UnicodeMap specialMap = new UnicodeMap(); - specialMap.putAll(scriptProp.getUnicodeMap()); + specialMap.putAll( + scriptProp.getUnicodeMap()); // if there is no value, use the script property specialMap.putAll(ScriptTester.getScriptSpecialsNames()); add( new UnicodeProperty.UnicodeMapProperty() .set(specialMap) .setMain("Script_Extensions", "scx", UnicodeProperty.ENUMERATED, "1.1") .addValueAliases( - ScriptTester.getScriptSpecialsAlternates(), + ScriptTester.getScriptSpecialsAlternates(scriptProp), AliasAddAction.IGNORE_IF_MISSING) .setMultivalued(true)); @@ -359,6 +362,7 @@ private void addExamplarProperty( // convert to UnicodeMap UnicodeMap unicodeMap = new UnicodeMap<>(); + unicodeMap.putAll(0, 0x10FFFF, ""); // default is empty string for (Entry> entry : data.asMap().entrySet()) { String value = JOIN_COMMAS.join(entry.getValue()).intern(); unicodeMap.put(entry.getKey(), value); @@ -383,11 +387,7 @@ private void addExamplarProperty( add( new UnicodeProperty.UnicodeMapProperty() .set(unicodeMap) - .setMain( - propertyName, - propertyAbbreviation, - UnicodeProperty.ENUMERATED, - "1.1") + .setMain(propertyName, propertyAbbreviation, UnicodeProperty.STRING, "1.1") .addValueAliases(locales, AliasAddAction.ADD_MAIN_ALIAS) .setMultivalued(true)); } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java index 8ed9706ef..903eb3112 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java @@ -3,11 +3,23 @@ import com.ibm.icu.text.UnicodeSet; import org.junit.jupiter.api.Test; import org.unicode.jsp.UnicodeSetUtilities; +import org.unicode.jsp.XPropertyFactory; +import org.unicode.props.UnicodeProperty; import org.unicode.unittest.TestFmwkMinusMinus; public class TestMultivalued extends TestFmwkMinusMinus { + + private static final boolean DEBUG = false; + + UnicodeProperty exemplarProp = XPropertyFactory.make().getProperty("exemplar"); + UnicodeProperty scxProp = XPropertyFactory.make().getProperty("scx"); + @Test public void TestScx1Script() { + if (DEBUG) { + String x = scxProp.getValue('।'); + } + String unicodeSetString = "\\p{scx=deva}"; UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString); @@ -20,6 +32,20 @@ public void TestScx1Script() { parsed.containsAll(mustNotContain)); } + @Test + public void TestScx1ScriptB() { + String unicodeSetString = "\\p{scx=Arab}"; + UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString); + + UnicodeSet mustContain = new UnicodeSet("[،ء]"); // one character single script, one multi + assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain)); + + UnicodeSet mustNotContain = new UnicodeSet("[ক]"); // one Bangla character + assertFalse( + unicodeSetString + " !contains " + mustNotContain, + parsed.containsAll(mustNotContain)); + } + @Test public void TestScxMulti() { String unicodeSetString = "\\p{scx=beng,deva}"; @@ -37,6 +63,10 @@ public void TestScxMulti() { @Test public void TestExemplars() { + if (DEBUG) { + String x = exemplarProp.getValue('æ'); + } + String unicodeSetString = "\\p{exem=da}"; UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString); @@ -48,4 +78,31 @@ public void TestExemplars() { unicodeSetString + " !contains " + mustNotContain, parsed.containsAll(mustNotContain)); } + + @Test + public void TestEmpty() { + assertEquals("exemplar(0x0000)", "", exemplarProp.getValue(0x0000)); + assertEquals("exemplar(α)", "el", exemplarProp.getValue('α')); + + UnicodeSet exem = UnicodeSetUtilities.parseUnicodeSet("\\p{exem}"); + assertTrue("\\p{exem} contains 0", exem.contains(0x0000)); + assertFalse("\\p{exem} contains α", exem.contains('α')); + UnicodeSet exem3 = UnicodeSetUtilities.parseUnicodeSet("\\p{exem=el}"); + assertFalse("\\p{exem=el} contains 0", exem3.contains(0x0000)); + assertTrue("\\p{exem=el} contains α", exem3.contains('α')); + + String unicodeSetString = "[\\p{Greek}&\\p{exem}]"; + UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString); + + String first = parsed.iterator().next(); + String firstValue = exemplarProp.getValue(first.codePointAt(0)); + assertEquals(unicodeSetString, "", firstValue); + + String unicodeSetString2 = "[\\p{Greek}&\\P{exem}]"; + UnicodeSet parsed2 = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString2); + + String first2 = parsed2.iterator().next(); + String firstValue2 = exemplarProp.getValue(first2.codePointAt(0)); + assertEquals(unicodeSetString2, "el", firstValue2); + } } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 615986a7a..205a9bfc3 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -190,7 +190,14 @@ public UnicodeProperty setMultivalued(boolean value) { (1 << ENUMERATED) | (1 << EXTENDED_ENUMERATED) | (1 << CATALOG) - | (1 << EXTENDED_CATALOG); + | (1 << EXTENDED_CATALOG), + BINARY_OR_ENUMERATED_OR_CATALOG_MASK = + (1 << ENUMERATED) + | (1 << EXTENDED_ENUMERATED) + | (1 << CATALOG) + | (1 << EXTENDED_CATALOG) + | (1 << BINARY) + | (1 << EXTENDED_BINARY); private static final String[] TYPE_NAMES = { "Unknown", @@ -405,7 +412,7 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) { public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { if (result == null) result = new UnicodeSet(); boolean uniformUnassigned = hasUniformUnassigned(); - if (isType(STRING_OR_MISC_MASK)) { + if (isType(STRING_OR_MISC_MASK) && !isMultivalued) { for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); usi.next(); ) { // int i = 0; i <= 0x10FFFF; ++i int i = usi.codepoint; @@ -423,7 +430,8 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { while (it.hasNext()) { String value = it.next(); temp.clear(); - Iterator it2 = getValueAliases(value, temp).iterator(); + final List valueAliases = getValueAliases(value, temp); + Iterator it2 = valueAliases.iterator(); while (it2.hasNext()) { String value2 = it2.next(); // System.out.println("Values:" + value2);