unicode-org · macchiati · Nov 28, 2023 · Nov 24, 2023 · Nov 25, 2023 · Nov 25, 2023
diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java b/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java
@@ -1,6 +1,10 @@
 package org.unicode.jsp;
 
+import com.google.common.base.Joiner;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.TreeMultimap;
 import com.ibm.icu.dev.util.UnicodeMap;
+import com.ibm.icu.dev.util.UnicodeMap.EntryRange;
 import com.ibm.icu.lang.CharSequences;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UProperty.NameChoice;
@@ -12,13 +16,19 @@
 import com.ibm.icu.text.Transform;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.LocaleData;
 import com.ibm.icu.util.ULocale;
 import com.ibm.icu.util.VersionInfo;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.TreeSet;
 import org.unicode.idna.Idna.IdnaType;
 import org.unicode.idna.Idna2003;
 import org.unicode.idna.Idna2008;
@@ -28,9 +38,13 @@
 import org.unicode.props.UnicodeProperty.BaseProperty;
 import org.unicode.props.UnicodeProperty.Factory;
 import org.unicode.props.UnicodeProperty.SimpleProperty;
+import org.unicode.text.utility.Utility;
 
 public class XPropertyFactory extends UnicodeProperty.Factory {
 
+    private static final Joiner JOIN_COMMAS = Joiner.on(",");
+    private static final boolean DEBUG_MULTI = false;
+
     static final UnicodeSet ALL =
             new UnicodeSet("[[:^C:][:Cc:][:Cf:][:noncharactercodepoint:]]").freeze();
 
@@ -96,6 +110,7 @@ public final Factory add2(UnicodeProperty sp) {
         add(
                 new CodepointTransformProperty(
                                 new Transform<Integer, String>() {
+                                    @Override
                                     public String transform(Integer source) {
                                         return Normalizer.normalize(source, Normalizer.NFC);
                                     }
@@ -105,6 +120,7 @@ public String transform(Integer source) {
         add(
                 new CodepointTransformProperty(
                                 new Transform<Integer, String>() {
+                                    @Override
                                     public String transform(Integer source) {
                                         return Normalizer.normalize(source, Normalizer.NFD);
                                     }
@@ -114,6 +130,7 @@ public String transform(Integer source) {
         add(
                 new CodepointTransformProperty(
                                 new Transform<Integer, String>() {
+                                    @Override
                                     public String transform(Integer source) {
                                         return Normalizer.normalize(source, Normalizer.NFKC);
                                     }
@@ -123,6 +140,7 @@ public String transform(Integer source) {
         add(
                 new CodepointTransformProperty(
                                 new Transform<Integer, String>() {
+                                    @Override
                                     public String transform(Integer source) {
                                         return Normalizer.normalize(source, Normalizer.NFKD);
                                     }
@@ -133,6 +151,7 @@ public String transform(Integer source) {
         add(
                 new StringTransformProperty(
                                 new StringTransform() {
+                                    @Override
                                     public String transform(String source) {
                                         return UCharacter.foldCase(source, true);
                                     }
@@ -142,6 +161,7 @@ public String transform(String source) {
         add(
                 new StringTransformProperty(
                                 new StringTransform() {
+                                    @Override
                                     public String transform(String source) {
                                         return UCharacter.toLowerCase(ULocale.ROOT, source);
                                     }
@@ -151,6 +171,7 @@ public String transform(String source) {
         add(
                 new StringTransformProperty(
                                 new StringTransform() {
+                                    @Override
                                     public String transform(String source) {
                                         return UCharacter.toUpperCase(ULocale.ROOT, source);
                                     }
@@ -160,6 +181,7 @@ public String transform(String source) {
         add(
                 new StringTransformProperty(
                                 new StringTransform() {
+                                    @Override
                                     public String transform(String source) {
                                         return UCharacter.toTitleCase(ULocale.ROOT, source, null);
                                     }
@@ -170,6 +192,7 @@ public String transform(String source) {
         add(
                 new StringTransformProperty(
                                 new StringTransform() {
+                                    @Override
                                     public String transform(String source) {
                                         StringBuilder b = new StringBuilder();
                                         for (int cp : CharSequences.codePoints(source)) {
@@ -184,6 +207,7 @@ public String transform(String source) {
         add(
                 new StringTransformProperty(
                                 new StringTransform() {
+                                    @Override
                                     public String transform(String source) {
                                         String result = NFM.nfm.get(source);
                                         return result == null ? source : result;
@@ -201,6 +225,7 @@ public String transform(String source) {
         add(
                 new CodepointTransformProperty(
                                 new Transform<Integer, String>() {
+                                    @Override
                                     public String transform(Integer source) {
                                         return UnicodeUtilities.getSubheader().getSubheader(source);
                                     }
@@ -239,6 +264,9 @@ public String transform(Integer source) {
                         .setMain("bmp", "bmp", UnicodeProperty.BINARY, "6.0"));
 
         addCollationProperty();
+        addExamplarProperty(LocaleData.ES_STANDARD, "exem", "exemplar");
+        addExamplarProperty(LocaleData.ES_AUXILIARY, "exema", "exemplar_aux");
+        addExamplarProperty(LocaleData.ES_PUNCTUATION, "exemp", "exemplar_punct");
 
         // set up the special script property
         UnicodeProperty scriptProp = base.getProperty("sc");
@@ -251,7 +279,8 @@ public String transform(Integer source) {
                         .setMain("Script_Extensions", "scx", UnicodeProperty.ENUMERATED, "1.1")
                         .addValueAliases(
                                 ScriptTester.getScriptSpecialsAlternates(),
-                                AliasAddAction.IGNORE_IF_MISSING));
+                                AliasAddAction.IGNORE_IF_MISSING)
+                        .setMultivalued(true));
 
         CachedProps cp = CachedProps.CACHED_PROPS;
         for (String prop : cp.getAvailable()) {
@@ -289,6 +318,81 @@ public String transform(Integer source) {
                         .setMain("RGI_Emoji", "RGI_Emoji", UnicodeProperty.BINARY, "13.0"));
     }
 
+    private void addExamplarProperty( // adds a multivalued property: code point -> exemplar locales
+            int exemplarType, String propertyAbbreviation, String propertyName) {
+        Multimap<Integer, String> data = TreeMultimap.create(); // code point -> locale names
+        Set<String> localeSet = new TreeSet<>(); // every locale that contributed a value
+
+        for (ULocale ulocale : ULocale.getAvailableLocales()) {
+            if (!ulocale.getCountry().isEmpty()) {
+                continue;
+                // Locales with a country code are skipped above. NOTE(review): the earlier note here
+                // was cut off; presumably the aim is to drop locales that only repeat their parent.
+            }
+            UnicodeSet exemplarSet = LocaleData.getExemplarSet(ulocale, 0, exemplarType);
+            if (!ulocale.getScript().isEmpty()) {
+                // we can't find out the parent locale or defaultContent locale in ICU, so as a hack a
+                // script locale is dropped when its set equals that of its bare language locale
+                String langLocale = ulocale.getLanguage();
+                UnicodeSet langExemplarSet =
+                        LocaleData.getExemplarSet(new ULocale(langLocale), 0, exemplarType);
+                if (langExemplarSet.equals(exemplarSet)) {
+                    continue;
+                }
+            }
+            String locale = ulocale.toString();
+            localeSet.add(locale);
+            for (UnicodeSetIterator it = new UnicodeSetIterator(exemplarSet); it.nextRange(); ) {
+                if (it.codepoint == UnicodeSetIterator.IS_STRING) { // string element, not a range
+                    // flatten: record this locale for every code point of the exemplar string
+                    int cp = 0;
+                    for (int i = 0; i < it.string.length(); i += Character.charCount(cp)) {
+                        cp = it.string.codePointAt(i);
+                        data.put(cp, locale);
+                    }
+                } else {
+                    for (int cp = it.codepoint; cp <= it.codepointEnd; ++cp) {
+                        data.put(cp, locale);
+                    }
+                }
+            }
+        }
+
+        // convert to UnicodeMap: each code point's value is its locale names joined with commas
+        UnicodeMap<String> unicodeMap = new UnicodeMap<>();
+        for (Entry<Integer, Collection<String>> entry : data.asMap().entrySet()) {
+            String value = JOIN_COMMAS.join(entry.getValue()).intern();
+            unicodeMap.put(entry.getKey(), value);
+        }
+        if (DEBUG_MULTI) { // debugging aid: print the property name, then each range and its value
+            System.out.println("\n" + propertyName);
+            for (EntryRange<String> entry : unicodeMap.entryRanges()) {
+                System.out.println(
+                        Utility.hex(entry.codepoint)
+                                + (entry.codepoint == entry.codepointEnd
+                                        ? ""
+                                        : "-" + Utility.hex(entry.codepointEnd))
+                                + " ;\t"
+                                + entry.value);
+            }
+        }
+
+        // put the collected locale names into the two-row array passed to addValueAliases below
+        String[] localeList = localeSet.toArray(new String[localeSet.size()]);
+        String[][] locales = new String[][] {localeList, localeList}; // both rows identical: abbreviations are the same
+
+        add(
+                new UnicodeProperty.UnicodeMapProperty()
+                        .set(unicodeMap)
+                        .setMain(
+                                propertyName,
+                                propertyAbbreviation,
+                                UnicodeProperty.ENUMERATED,
+                                "1.1")
+                        .addValueAliases(locales, AliasAddAction.ADD_MAIN_ALIAS)
+                        .setMultivalued(true));
+    }
+
     private void addCollationProperty() {
         RuleBasedCollator c = UnicodeSetUtilities.RAW_COLLATOR;
         // (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
@@ -652,6 +756,7 @@ public StringTransformProperty(
             setUniformUnassigned(hasUniformUnassigned);
         }
 
+        @Override
         protected String _getValue(int codepoint) {
             return transform.transform(UTF16.valueOf(codepoint));
         }
@@ -666,6 +771,7 @@ public CodepointTransformProperty(
             setUniformUnassigned(hasUniformUnassigned);
         }
 
+        @Override
         protected String _getValue(int codepoint) {
             return transform.transform(codepoint);
         }
@@ -682,6 +788,7 @@ public static class EncodingProperty extends SimpleProperty {
             encoder = new CharEncoder(charset, false, false);
         }
 
+        @Override
         protected String _getValue(int codepoint) {
             int len = encoder.getValue(codepoint, temp, 0);
             if (len < 0) {
@@ -697,6 +804,7 @@ protected String _getValue(int codepoint) {
             return result.toString();
         }
 
+        @Override
         public boolean isDefault(int codepoint) {
             int len = encoder.getValue(codepoint, temp, 0);
             return len < 0;
@@ -716,6 +824,7 @@ public static class EncodingPropertyBoolean extends SimpleProperty {
             encoder = new CharEncoder(charset, true, true);
         }
 
+        @Override
         protected String _getValue(int codepoint) {
             return (encoder.getValue(codepoint, null, 0) > 0) ? "Yes" : "No";
         }
@@ -731,6 +840,7 @@ public XPropertyFactory.UnicodeSetProperty set(UnicodeSet set) {
             return this;
         }
 
+        @Override
         protected UnicodeMap<String> _getUnicodeMap() {
             UnicodeMap<String> result = new UnicodeMap<String>();
             result.putAll(unicodeSet, "Yes");
@@ -743,10 +853,12 @@ public XPropertyFactory.UnicodeSetProperty set(String string) {
             return set(new UnicodeSet(string).freeze());
         }
 
+        @Override
         protected String _getValue(int codepoint) {
             return YESNO_ARRAY[unicodeSet.contains(codepoint) ? 0 : 1];
         }
 
+        @Override
         protected List _getAvailableValues(List result) {
             return YESNO;
         }

diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java
@@ -0,0 +1,53 @@
+package org.unicode.jsptest;
+
+import com.ibm.icu.text.UnicodeSet;
+import org.junit.jupiter.api.Test;
+import org.unicode.jsp.UnicodeSetUtilities;
+import org.unicode.unittest.TestFmwkMinusMinus;
+
+public class TestMultivalued extends TestFmwkMinusMinus { // parsing of \p{...} for multivalued properties
+    @Test
+    public void TestScx1Script() {
+        // Regression check: as of 2023-11-24, scx was not working properly
+        String unicodeSetString = "\\p{scx=deva}";
+        UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);
+
+        UnicodeSet mustContain = new UnicodeSet("[ᳵ।]"); // per the original note: one character is B&D (presumably Beng, Deva), the other B&D&D&G&... (more scripts)
+        assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain));
+
+        UnicodeSet mustNotContain = new UnicodeSet("[ক]"); // one Bengali character, expected to be absent from scx=deva
+        assertFalse(
+                unicodeSetString + " !contains " + mustNotContain,
+                parsed.containsAll(mustNotContain));
+    }
+
+    @Test
+    public void TestScxMulti() {
+        // As of 2023-11-24, scx was not working properly; a comma-separated value must be rejected
+        String unicodeSetString = "\\p{scx=beng,deva}";
+        String exceptionMessage = null;
+        try {
+            UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString); // expected to throw; result unused
+        } catch (Exception e) {
+            exceptionMessage = e.getMessage();
+        }
+        assertEquals(
+                "Expected exception",
+                "Multivalued property values can't contain commas.",
+                exceptionMessage);
+    }
+
+    @Test
+    public void TestExemplars() {
+        String unicodeSetString = "\\p{exem=da}"; // exemplar property queried for the locale value "da"
+        UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);
+
+        UnicodeSet mustContain = new UnicodeSet("[æ]"); // expected to carry the "da" value
+        assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain));
+
+        UnicodeSet mustNotContain = new UnicodeSet("[ç]"); // expected not to carry the "da" value
+        assertFalse(
+                unicodeSetString + " !contains " + mustNotContain,
+                parsed.containsAll(mustNotContain));
+    }
+}
diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java
@@ -413,14 +413,6 @@ public void TestPerMill(final String name, final Charset charset) {
         }
     }
 
-    @Test
-    public void TestScriptSpecials() {
-        //        UnicodeSet set = UnicodeSetUtilities.parseUnicodeSet("[:scs=Hant:]");
-        //        assertNotEquals("Hant", 0, set.size());
-        UnicodeSet set2 = UnicodeSetUtilities.parseUnicodeSet("[:scx=Arab,Syrc:]");
-        assertNotEquals("Arab Syrc", 0, set2.size());
-    }
-
     @Test
     public void TestGC() {
         Map<String, R2<String, UnicodeSet>> SPECIAL_GC =