From f8ec0531b65c1e1793f979a19afa03301dd118f2 Mon Sep 17 00:00:00 2001 From: macchiati Date: Sat, 17 Aug 2024 12:14:59 -0700 Subject: [PATCH] CLDR-17535 partial changes for test failures, more to come --- common/properties/scriptMetadata.txt | 2 +- common/supplemental/likelySubtags.xml | 541 +++++++++++++++++- common/supplemental/supplementalData.xml | 2 +- .../unicode/cldr/draft/ScriptMetadata.java | 43 ++ .../cldr/tool/GenerateLikelySubtags.java | 361 ++++++++---- .../unicode/cldr/util/LocaleValidator.java | 18 +- .../cldr/util/data/Script_Metadata.csv | 4 +- .../util/data/country_language_population.tsv | 2 +- .../cldr/unittest/LikelySubtagsTest.java | 125 ++-- .../org/unicode/cldr/unittest/TestLocale.java | 12 +- 10 files changed, 885 insertions(+), 225 deletions(-) diff --git a/common/properties/scriptMetadata.txt b/common/properties/scriptMetadata.txt index 0ac8ab4bc79..9dee6c4680c 100644 --- a/common/properties/scriptMetadata.txt +++ b/common/properties/scriptMetadata.txt @@ -180,7 +180,7 @@ Onao; 33; 1E5D0; IN; 1; EXCLUSION; NO; NO; MIN; NO; NO # provisional data for f Orkh; 33; 10C00; MN; 1; EXCLUSION; YES; NO; NO; NO; NO Osge; 33; 104B5; US; 1; LIMITED_USE; NO; NO; NO; NO; YES Osma; 33; 10480; SO; 1; EXCLUSION; NO; NO; NO; NO; NO -Ougr; 33; 10F7C; 143; 1; EXCLUSION; YES; NO; YES; NO; NO +Ougr; 33; 10F7C; CN; 1; EXCLUSION; YES; NO; YES; NO; NO Palm; 33; 10873; SY; 1; EXCLUSION; YES; NO; NO; NO; NO Pauc; 33; 11AC0; MM; 1; EXCLUSION; NO; NO; NO; NO; NO Perm; 33; 1036B; RU; 1; EXCLUSION; NO; NO; NO; NO; NO diff --git a/common/supplemental/likelySubtags.xml b/common/supplemental/likelySubtags.xml index 224f6f4c514..65c3f7002ba 100644 --- a/common/supplemental/likelySubtags.xml +++ b/common/supplemental/likelySubtags.xml @@ -20,15 +20,12 @@ not be patched by hand, as any changes made in that fashion may be lost. - - - @@ -42,6 +39,7 @@ not be patched by hand, as any changes made in that fashion may be lost. 
+ @@ -67,6 +65,7 @@ not be patched by hand, as any changes made in that fashion may be lost. + @@ -88,7 +87,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -99,7 +97,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -164,7 +161,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -176,7 +172,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -240,7 +235,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -271,7 +265,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -279,6 +272,7 @@ not be patched by hand, as any changes made in that fashion may be lost. + @@ -321,7 +315,7 @@ not be patched by hand, as any changes made in that fashion may be lost. - + @@ -381,7 +375,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -392,7 +385,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -401,8 +393,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - - @@ -554,7 +544,7 @@ not be patched by hand, as any changes made in that fashion may be lost. - + @@ -578,12 +568,10 @@ not be patched by hand, as any changes made in that fashion may be lost. - - @@ -655,13 +643,11 @@ not be patched by hand, as any changes made in that fashion may be lost. - - @@ -688,7 +674,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -702,7 +687,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -724,7 +708,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -737,7 +720,6 @@ not be patched by hand, as any changes made in that fashion may be lost. - @@ -856,21 +838,25 @@ not be patched by hand, as any changes made in that fashion may be lost. 
- + + + + + @@ -879,31 +865,41 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + + + + + + @@ -915,21 +911,31 @@ not be patched by hand, as any changes made in that fashion may be lost. - + + + + + + + + + + + @@ -937,26 +943,36 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + + + + + + @@ -968,26 +984,35 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + + + + + @@ -997,21 +1022,26 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + @@ -1023,7 +1053,11 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + @@ -1035,15 +1069,22 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + + + @@ -1051,35 +1092,100 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1087,10 +1193,15 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + @@ -1098,49 +1209,128 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + @@ -1148,48 +1338,299 @@ not be patched by hand, as any changes made in that fashion may be lostnot be patched by hand, as any changes made in that fashion may be lost. + + + + + + + + + + @@ -1216,15 +1667,20 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + - + @@ -1236,16 +1692,21 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + + @@ -1254,22 +1715,44 @@ not be patched by hand, as any changes made in that fashion may be lost. 
+ + + + + + + + + + + + + + + + + + + + + + @@ -1277,6 +1760,7 @@ not be patched by hand, as any changes made in that fashion may be lost. + @@ -1284,6 +1768,7 @@ not be patched by hand, as any changes made in that fashion may be lost. + diff --git a/common/supplemental/supplementalData.xml b/common/supplemental/supplementalData.xml index ac6f516ee3f..7ddde428e6c 100644 --- a/common/supplemental/supplementalData.xml +++ b/common/supplemental/supplementalData.xml @@ -4226,7 +4226,7 @@ XXX Code for transations where no currency is involved - + diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java b/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java index f4ec6489ca0..f7f56941980 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java @@ -23,6 +23,9 @@ import org.unicode.cldr.util.Containment; import org.unicode.cldr.util.SemiFileReader; import org.unicode.cldr.util.StandardCodes; +import org.unicode.cldr.util.StandardCodes.LstrType; +import org.unicode.cldr.util.Validity; +import org.unicode.cldr.util.Validity.Status; import org.unicode.cldr.util.With; public class ScriptMetadata { @@ -139,6 +142,12 @@ public static void addNameToCode(String type, Map hashMap) { public static final class SkipNewUnicodeException extends ICUException {} + /** + * Scripts that either have no known languages as yet (Cpmn) or are used for any language + * (Brai). 
+ */ + public static final Set SCRIPTS_WITH_NO_LANGUAGES = Set.of("Brai", "Cpmn"); + public static class Info implements Comparable { public final int rank; public final VersionInfo age; @@ -173,6 +182,7 @@ private Info(String[] items) { ime = trinaryLookup.forString(Column.IME.getItem(items)); hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items)); density = Column.DENSITY.getInt(items, -1); + String script = items[2]; final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items); String country = CountryCodeConverter.getCodeFromName(countryRaw, false); @@ -191,6 +201,39 @@ private Info(String[] items) { langCode = null; } likelyLanguage = langCode == null ? "und" : langCode; + + // check for bad countries, bad languages + + final Status scriptStatus = + Validity.getInstance().getCodeToStatus(LstrType.script).get(script); + if (!(scriptStatus == Status.special || scriptStatus == Status.unknown)) { + final Status countryStatus = + Validity.getInstance().getCodeToStatus(LstrType.region).get(originCountry); + if (countryStatus != Status.regular) { + errors.add( + "ScriptMetadata.java: the country (" + + originCountry + + ") for " + + script + + " is not valid: " + + countryStatus); + } + final Status languageStatus = + Validity.getInstance() + .getCodeToStatus(LstrType.language) + .get(likelyLanguage); + if (languageStatus != Status.regular + // make exception for scripts that has no known languages + && !SCRIPTS_WITH_NO_LANGUAGES.contains(script)) { + errors.add( + "ScriptMetadata.java: the likely language (" + + likelyLanguage + + ") for " + + script + + " is not valid: " + + languageStatus); + } + } } public Info(Info other, String string, String sampleCharacter) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java index a1ffc587e88..61556319f9b 100644 --- 
a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java @@ -6,6 +6,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row; import com.ibm.icu.impl.Row.R2; @@ -19,6 +20,7 @@ import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -44,6 +46,7 @@ import org.unicode.cldr.util.LanguageTagParser; import org.unicode.cldr.util.LocaleNames; import org.unicode.cldr.util.LocaleScriptInfo; +import org.unicode.cldr.util.LocaleValidator; import org.unicode.cldr.util.SimpleFactory; import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.StandardCodes.LstrType; @@ -65,6 +68,8 @@ public class GenerateLikelySubtags { private static final Map LANGUAGE_CODE_TO_STATUS = Validity.getInstance().getCodeToStatus(LstrType.language); + private static final Map SCRIPT_CODE_TO_STATUS = + Validity.getInstance().getCodeToStatus(LstrType.script); private static final String TEMP_UNKNOWN_REGION = "XZ"; @@ -161,7 +166,6 @@ public static void main(String[] args) throws IOException { jsonErrors.printAll(); } - Map old = supplementalData.getLikelySubtags(); Map oldOrigins = supplementalData.getLikelyOrigins(); System.out.println("origins: " + new TreeSet<>(oldOrigins.values())); @@ -171,6 +175,22 @@ public static void main(String[] args) throws IOException { Map result = minimize(toMaximized, itemsRemoved); + // Verify that the minimized version produces the same results + + LikelySubtags max = new LikelySubtags(toMaximized); + LikelySubtags min = new LikelySubtags(result); + + Map minFailures = new TreeMap<>(LOCALE_SOURCE); + for (Entry entry : 
toMaximized.entrySet()) { + String source = entry.getKey(); + String target = entry.getValue(); + String minTarget = min.maximize(source); + if (!target.equals(minTarget)) { + minFailures.put(source, target); + System.out.println(JOIN_TAB.join("Failure: ", source, target, minTarget)); + } + } + Set newAdditions = new TreeSet(); Set newMissing = new TreeSet(); @@ -178,33 +198,30 @@ public static void main(String[] args) throws IOException { System.out.println(JOIN_TAB.join("Source", "Name", "oldValue", "Name", "newValue", "Name")); + final SupplementalDataInfo oldSupplementalInfo = + SupplementalDataInfo.getInstance( + CldrUtility.getPath(CLDRPaths.LAST_COMMON_DIRECTORY, "supplemental/")); + final Map oldLikelyData = oldSupplementalInfo.getLikelySubtags(); + final Map oldLikelyOrigins = oldSupplementalInfo.getLikelyOrigins(); + LikelySubtags oldLikely = new LikelySubtags(oldLikelyData); + Set sorted = new TreeSet<>(LOCALE_SOURCE); sorted.addAll(result.keySet()); - sorted.addAll(old.keySet()); + sorted.addAll(oldLikelyData.keySet()); for (String source : sorted) { - String oldValue = old.get(source); - String newValue = result.get(source); - String removal = itemsRemoved.get(source); - - if (newValue == null) { - LSRSource silValue = silData.get(source); - if (silValue != null) { - newValue = silValue.getLsrString(); - } + String oldValue = oldLikely.maximize(source); + String oldOrigin = oldLikelyOrigins.get(source); + if (oldOrigin != null && oldOrigin.contains("sil1")) { + continue; // we don't control variations in sil data } + String newValue = min.maximize(source); + String removal = itemsRemoved.get(source); if (Objects.equal(oldValue, newValue)) { continue; } - // SKIP the sil values; those will be recreated - - final String origins = oldOrigins.get(source); - if (origins != null && origins.contains("sil1")) { - continue; // skip for now - } - // skip new values, or oldValues that are specifically removed if (oldValue == null || oldValue.equals(removal)) { @@ 
-247,7 +264,7 @@ public static void main(String[] args) throws IOException { } private static final List KEEP_TARGETS = - DROP_HARDCODED ? List.of() : List.of("und_Arab_PK", "und_Latn_ET", "hi_Latn"); + DROP_HARDCODED ? List.of() : List.of("und_Arab_PK", "und_Latn_ET"); private static final ImmutableSet deprecatedISONotInLST = DROP_HARDCODED ? ImmutableSet.of() : ImmutableSet.of("scc", "scr"); @@ -295,7 +312,6 @@ public static void main(String[] args) throws IOException { "ojs_Cans_CA", "oka_Latn_CA", "pqm_Latn_CA", - "hi_Latn_IN", "no_Latn_NO", "tok_Latn_001", "prg_Latn_PL", @@ -306,7 +322,9 @@ public static void main(String[] args) throws IOException { * results. Safer is to add to MAX_ADDITIONS. However, if you add, add both the language and * language+script mappings. */ + // Many of the overrides below can be removed once the language/pop/country data is updated. + private static final Map LANGUAGE_OVERRIDES = CldrUtility.asMap( DROP_HARDCODED @@ -361,7 +379,7 @@ public static void main(String[] args) throws IOException { {"sr_Latn", "sr_Latn_RS"}, {"ss", "ss_Latn_ZA"}, {"ss_Latn", "ss_Latn_ZA"}, - {"swc", "swc_Latn_CD"}, + // {"swc", "swc_Latn_CD"}, {"ti", "ti_Ethi_ET"}, {"ti_Ethi", "ti_Ethi_ET"}, {LocaleNames.UND, "en_Latn_US"}, @@ -371,7 +389,6 @@ public static void main(String[] args) throws IOException { {"und_Arab_PK", "ur_Arab_PK"}, {"und_Bopo", "zh_Bopo_TW"}, {"und_Deva_FJ", "hif_Deva_FJ"}, - {"und_EZ", "de_Latn_EZ"}, {"und_Hani", "zh_Hani_CN"}, {"und_Hani_CN", "zh_Hani_CN"}, {"und_Kana", "ja_Kana_JP"}, @@ -393,8 +410,6 @@ public static void main(String[] args) throws IOException { {"und_SO", "so_Latn_SO"}, {"und_SS", "en_Latn_SS"}, {"und_TK", "tkl_Latn_TK"}, - {"und_UN", "en_Latn_UN"}, - {"und_005", "pt_Latn_BR"}, {"vo", "vo_Latn_001"}, {"vo_Latn", "vo_Latn_001"}, // {"yi", "yi_Hebr_001"}, @@ -441,7 +456,6 @@ public static void main(String[] args) throws IOException { // { "mis_Medf", "mis_Medf_NG" }, {"ku_Yezi", "ku_Yezi_GE"}, - {"und_EU", 
"en_Latn_IE"}, {"hnj", "hnj_Hmnp_US"}, // preferred lang/script in CLDR {"hnj_Hmnp", "hnj_Hmnp_US"}, {"und_Hmnp", "hnj_Hmnp_US"}, @@ -461,6 +475,25 @@ public static void main(String[] args) throws IOException { {"und_CC", "ms_Arab_CC"}, {"und_SL", "kri_Latn_SL"}, {"und_SS", "ar_Arab_SS"}, + + // additions for missing values from LikelySubtagsText + {"und_Arab_AF", "fa_Arab_AF"}, + {"und_Cyrl_BG", "bg_Cyrl_BG"}, + {"und_Tibt_BT", "dz_Tibt_BT"}, + {"und_Cyrl_BY", "be_Cyrl_BY"}, + {"und_Arab_CC", "ms_Arab_CC"}, + {"und_Ethi_ER", "ti_Ethi_ER"}, + {"und_Arab_IR", "fa_Arab_IR"}, + {"und_Cyrl_KG", "ky_Cyrl_KG"}, + {"und_Cyrl_MK", "mk_Cyrl_MK"}, + {"und_Cyrl_MN", "mn_Cyrl_MN"}, + {"und_Deva_NP", "ne_Deva_NP"}, + {"und_Cyrl_RS", "sr_Cyrl_RS"}, + {"und_Cyrl_TJ", "tg_Cyrl_TJ"}, + {"und_Cyrl_UA", "uk_Cyrl_UA"}, + {"arc_Hatr", "arc_Hatr_IQ"}, + {"hnj_Hmng", "hnj_Hmng_LA"}, + {"bap_Krai", "bap_Krai_IN"}, }); /** @@ -658,8 +691,11 @@ private static Map generatePopulationData(Map to // Set,Double> rowsToCounts = new TreeMap(); MaxData maxData = new MaxData(); Set cldrLocales = factory.getAvailable(); + // skip ZZ Set otherTerritories = - new TreeSet<>(standardCodes.getGoodAvailableCodes("territory")); + new TreeSet<>( + Sets.difference( + standardCodes.getGoodAvailableCodes("territory"), Set.of("ZZ"))); // process all the information to get the top values for each triple. // each of the combinations of 1 or 2 components gets to be a key. 
@@ -691,23 +727,26 @@ private static Map generatePopulationData(Map to if (data.getOfficialStatus() == OfficialStatus.unknown) { final String locale = writtenLanguage + "_" + region; - if (literatePopulation >= minimalLiteratePopulation) { - // ok, skip - } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE - && cldrLocales.contains(locale)) { - // ok, skip - } else { - // if (SHOW_ADD) - // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t" - // + english.getName(locale) - // + "\t-- too small:\t" + number.format(literatePopulation)); - // continue; - } + // if (literatePopulation >= minimalLiteratePopulation) { + // // ok, skip + // } else if (literatePopulation >= + // MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE + // && cldrLocales.contains(locale)) { + // // ok, skip + // } else { + // // if (SHOW_ADD) + // // System.out.println("Skipping:\t" + writtenLanguage + // + "\t" + region + "\t" + // // + english.getName(locale) + // // + "\t-- too small:\t" + + // number.format(literatePopulation)); + // // continue; + // } order *= UNOFFICIAL_SCALE_DOWN; if (watching(SHOW_POP, writtenLanguage)) System.out.println( JOIN_TAB.join( - "Pop:", + "Scaling unofficial: ", writtenLanguage, region, getNameSafe(locale), @@ -797,23 +836,36 @@ private static Map generatePopulationData(Map to if (replacements == null) { continue; } - String goodLanguage = replacements.get(0); String badLanguage = str.getKey(); - if (badLanguage.contains("_")) { + if (badLanguage.contains("_")) { // only single subtag continue; } + if (deprecatedISONotInLST.contains(badLanguage)) { continue; } + + if (LANGUAGE_CODE_TO_STATUS.get(badLanguage) != Validity.Status.regular) { + if (!LocaleValidator.ALLOW_IN_LIKELY.isAllowed( + LstrType.language, badLanguage, null, null)) { + continue; + } + } + + // see what the values are for the replacements + + String goodLanguage = replacements.get(0); Set> goodLanguageData = maxData.languages.getAll(goodLanguage); if (goodLanguageData == 
null) { continue; } + R3 value = goodLanguageData.iterator().next(); final String script = value.get1(); final String region = value.get2(); + maxData.add(badLanguage, script, region, 1.0); System.out.println( "Adding aliases: " @@ -829,8 +881,8 @@ private static Map generatePopulationData(Map to // now, get the best for each one for (String language : maxData.languages.keySet()) { R3 value = maxData.languages.getAll(language).iterator().next(); - final Comparable script = value.get1(); - final Comparable region = value.get2(); + final String script = value.get1(); + final String region = value.get2(); add( language, language + "_" + script + "_" + region, @@ -992,12 +1044,29 @@ private static Map generatePopulationData(Map to TreeSet sorted = new TreeSet<>(ScriptMetadata.getScripts()); for (String script : sorted) { + switch (SCRIPT_CODE_TO_STATUS.get(script)) { + case special: + case unknown: + continue; + default: + break; + } Info i = ScriptMetadata.getInfo(script); String likelyLanguage = i.likelyLanguage; + String originCountry = i.originCountry; if (LANGUAGE_CODE_TO_STATUS.get(likelyLanguage) == Status.special) { likelyLanguage = LocaleNames.UND; } - String originCountry = i.originCountry; + LanguageTagParser ltp = + new LanguageTagParser() + .setLanguage(likelyLanguage) + .setScript(script) + .setRegion(originCountry); + Set errors = new LinkedHashSet<>(); + if (!LocaleValidator.isValid(ltp, LocaleValidator.ALLOW_IN_LIKELY, errors)) { + System.out.println(JOIN_LS.join("Failure in ScriptMetaData: " + ltp, errors)); + continue; + } final String result = likelyLanguage + "_" + script + "_" + originCountry; add("und_" + script, result, toMaximized, "S->LR•", LocaleOverride.KEEP_EXISTING); add(likelyLanguage, result, toMaximized, "L->SR•", LocaleOverride.KEEP_EXISTING); @@ -1044,7 +1113,7 @@ private static Map generatePopulationData(Map to toMaximized.remove(row.get(0)); } } - return toMaximized; + return CldrUtility.protectCollection(toMaximized); } /** Class 
for maximizing data sources */ @@ -1161,12 +1230,8 @@ private static void add( "", kind)); } - } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) { - // if (showAction) { - // System.out.println("Skipping:\t" + key + "\t→\t" + value + "\t\t\t\t" + kind); - // } - return; - } else { + toAdd.put(key, value); + } else if (override != LocaleOverride.KEEP_EXISTING && !value.equals(oldValue)) { if (watching(showAction, key, value)) { System.out.println( JOIN_TAB.join( @@ -1182,8 +1247,8 @@ private static void add( getNameSafe(oldValue), kind)); } + toAdd.put(key, value); } - toAdd.put(key, value); } public static String truncateLongString(Object data, int maxLen) { @@ -1197,68 +1262,152 @@ public static String truncateLongString(Object data, int maxLen) { return info; } + /** + * Minimize
+ * We know that the following algorithm will be used in the lookup, so we remove mappings that + * are redundant. https://cldr-smoke.unicode.org/spec/main/ldml/tr35.html#likely-subtags
+ * A subtag is called empty if it is a missing script or region subtag, or it is a base language + * subtag with the value "und". In the description below, a subscript on a subtag x indicates + * which tag it is from: x<sub>s</sub> is in the source, x<sub>m</sub> is in a match, and x<sub>r</sub> is in the final result. + * + *

Lookup. Look up each of the following in order, and stop on the first match: + * + *

    + *
  1. language<sub>s</sub>_script<sub>s</sub>_region<sub>s</sub> + *
  2. language<sub>s</sub>_script<sub>s</sub> + *
  3. language<sub>s</sub>_region<sub>s</sub> + *
  4. language<sub>s</sub> + *
+ * + *

Return + * + *

+ * + *

    + *
  1. If there is no match, signal an error and stop. + *
  2. Otherwise there is a match = language<sub>m</sub>_script<sub>m</sub>_region<sub>m</sub> + *
  3. Let x<sub>r</sub> = x<sub>s</sub> if x<sub>s</sub> is neither empty nor 'und', and x<sub>m</sub> otherwise. + *
  4. Return the language tag composed of language<sub>r</sub>_script<sub>r</sub>_region<sub>r</sub> + variants + extensions. + *
+ */ public static Map minimize( - Map fluffup, Map itemsRemoved) { - LanguageTagParser parser = new LanguageTagParser(); + Map max, Map itemsRemoved) { + LanguageTagParser sourceParser = new LanguageTagParser(); LanguageTagParser targetParser = new LanguageTagParser(); + LanguageTagParser tempParser = new LanguageTagParser(); Map removals = new TreeMap<>(); + Map toMinimize = new TreeMap<>(LOCALE_SOURCE); + toMinimize.putAll(max); + + // We should never have an LocaleScriptInfo.UNKNOWN_REGION, or + // LocaleScriptInfo.UNKNOWN_SCRIPT + // The unit tests will guarantee this if somehow we slip up + // Similarly, we should never have the target have language="und", or be missing script or + // region + // We also know that the source never has 3 full fields (ie, never L≠und && S≠"" && R≠"") + + // We remove redundant mappings. For example + // For example, suppose we have the following mappings: + // {aa=aa_Latn_ET, aa_DJ=aa_Latn_DJ, aa_ER=aa_Latn_ER} + // Using the algorithm above if aa_DJ=aa_Latn_DJ were not there we would + // 1. check for aa_DJ, fail + // 2. check for aa, get aa_Latn_ET, and substitute DJ for ET, getting the right answer. 
+ + // Make multiple passes if necessary for (int pass = 0; ; ++pass) { removals.clear(); - for (Entry entry : fluffup.entrySet()) { - String locale = entry.getKey(); + for (Entry entry : toMinimize.entrySet()) { + String source = entry.getKey(); String target = entry.getValue(); - - if (targetParser.set(target).getRegion().equals(LocaleScriptInfo.UNKNOWN_REGION)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Unknown Region in target"); - continue; - } - if (targetParser.getScript().equals(LocaleScriptInfo.UNKNOWN_SCRIPT)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Unknown Script in target"); - continue; + sourceParser.set(source); + String sLang = sourceParser.getLanguage(); + String sScript = sourceParser.getScript(); + String sRegion = sourceParser.getRegion(); + boolean realSLang = !sLang.equals("und"); + boolean realSScript = !sScript.isEmpty(); + boolean realSRegion = !sRegion.isEmpty(); + + if (realSLang && realSScript && realSRegion) { + throw new IllegalArgumentException("Bogus source: " + source); } - String region = parser.set(locale).getRegion(); - if (region.length() != 0) { - if (region.equals(LocaleScriptInfo.UNKNOWN_REGION)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Unknown Region in source"); + targetParser.set(target); + // String tLang = targetParser.getLanguage(); + // String tScript = targetParser.getScript(); + // String tRegion = targetParser.getRegion(); + + if (realSLang && realSScript) { // see if either singleton gives us the region + String possibleSuper = toMinimize.get(sLang); + // if the target is the same except for the script, we remove + tempParser.set(possibleSuper).setScript(sScript); + if (target.equals(tempParser.toString())) { + removals.put(source, target); + showRemoving( + pass, + source, + target, + "Redundant with\t" + sLang + " => " + possibleSuper); continue; } - parser.setRegion(""); - String newLocale = parser.toString(); - String 
newTarget = fluffup.get(newLocale); - if (newTarget != null) { - newTarget = targetParser.set(newTarget).setRegion(region).toString(); - if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Redundant with\t" + newLocale); - continue; - } - } - } - String script = parser.set(locale).getScript(); - if (locale.equals(DEBUG_ADD_KEY)) { - System.out.println("*debug*"); - } - if (script.length() != 0) { - if (script.equals(LocaleScriptInfo.UNKNOWN_SCRIPT)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Unknown Script"); + // possibleSuper = toMinimize.get("und_" + sScript); + // if (target.equals(possibleSuper)) { + // removals.put(source, target); + // showRemoving( + // pass, + // source, + // target, + // "Redundant with\t" + "und_" + sScript + " => " + // + possibleSuper); + // continue; + // } + } else if (realSLang + && realSRegion) { // see if either singleton gives us the script + String possibleSuper = toMinimize.get(sLang); + tempParser.set(possibleSuper).setRegion(sRegion); + if (target.equals(tempParser.toString())) { + removals.put(source, target); + showRemoving( + pass, + source, + target, + "Redundant with\t" + sLang + " => " + possibleSuper); continue; } - parser.setScript(""); - String newLocale = parser.toString(); - String newTarget = fluffup.get(newLocale); - if (newTarget != null) { - newTarget = targetParser.set(newTarget).setScript(script).toString(); - if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Redundant with\t" + newLocale); - continue; - } - } + // possibleSuper = toMinimize.get("und_" + sRegion); + // if (target.equals(possibleSuper)) { + // removals.put(source, target); + // showRemoving( + // pass, + // source, + // target, + // "Redundant with\t" + "und_" + sScript + " => " + // + possibleSuper); + // continue; + // } + // } else if (hasScript 
&& hasRegion) { // see if some singleton + // gives us the language + // String possibleSuper = toMinimize.get("und_" + sScript); + // if (target.equals(possibleSuper)) { + // removals.put(source, target); + // showRemoving( + // pass, + // source, + // target, + // "Redundant with\t" + "und_" + sScript + " => " + // + possibleSuper); + // continue; + // } + // possibleSuper = toMinimize.get("und_" + sRegion); + // if (target.equals(possibleSuper)) { + // removals.put(source, target); + // showRemoving( + // pass, + // source, + // target, + // "Redundant with\t" + "und_" + sRegion + " => " + // + possibleSuper); + // continue; + // } } } if (removals.size() == 0) { @@ -1266,10 +1415,14 @@ public static Map minimize( } itemsRemoved.putAll(removals); for (String locale : removals.keySet()) { - fluffup.remove(locale); + toMinimize.remove(locale); } } - return fluffup; + return CldrUtility.protectCollection(toMinimize); + } + + static class MapView { + K skip; } public static void showRemoving( diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java index 0a6532e4d59..b83c5b57d59 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java @@ -22,6 +22,15 @@ public class LocaleValidator { static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); + /** For backwards compatibility, certain non-regular codes are allowed in LikelySubtags. 
*/ + public static final LocaleValidator.AllowedValid ALLOW_IN_LIKELY = + new LocaleValidator.AllowedValid( + null, + LstrType.region, + new LocaleValidator.AllowedMatch("001|419"), + LstrType.language, + new LocaleValidator.AllowedMatch("und|in|iw|ji|jw|mo|tl")); + static final Validity VALIDITY = Validity.getInstance(); static final Set FIELD_ALLOWS_EMPTY = Set.of(LstrType.script, LstrType.region); // Map>> @@ -100,18 +109,21 @@ public static class AllowedValid { private final Set allowedStatus; // allowed without exception private final Multimap allowedExceptions; - boolean isAllowed(Validity.Status status) { + public boolean isAllowed(Validity.Status status) { return allowedStatus.contains(status); } /** Only called if isAllowed is not true */ - boolean isAllowed(LstrType lstrType, String key, String value, Validity.Status status) { + public boolean isAllowed( + LstrType lstrType, String key, String value, Validity.Status status) { Collection allowedMatches = allowedExceptions.get(lstrType); if (allowedMatches == null) { return false; } for (AllowedMatch allowedMatch : allowedMatches) { - if (allowedMatch.matches(key, value, status)) return true; + if (allowedMatch.matches(key, value, status)) { + return true; + } } return false; } diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv index bdf7170097b..408ab50a7ab 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv @@ -128,7 +128,7 @@ WR,Name,Script_Code,Age,Size,Sample,Sample_Code,Origin Country,~Density,Likely L 126,Warang_Citi,Wara,7.0,84,𑢴,118B4,India,1,Ho,hoc,Exclusion,no,no,no,no,Yes 127,Ahom,Ahom,8.0,0,𑜗,11717,India,1,Ahom,aho,Exclusion,no,Yes,Yes,no,no 128,Anatolian_Hieroglyphs,Hluw,8.0,0,𔐀,14400,Turkey,1,Hieroglyphic 
Luwian,hlu,Exclusion,no,no,no,Yes,no -129,Hatran,Hatr,8.0,0,𐣴,108F4,Iraq,1,Uncoded Languages,mis,Exclusion,Yes,no,no,no,no +129,Hatran,Hatr,8.0,0,𐣴,108F4,Iraq,1,Aramaic,arc,Exclusion,Yes,no,no,no,no 130,Multani,Mult,8.0,0,𑊏,1128F,Pakistan,1,Seraiki,skr,Exclusion,no,no,no,no,no 131,Old_Hungarian,Hung,8.0,0,𐲡,10CA1,Hungary,1,Hungarian,hu,Exclusion,Yes,no,no,no,Yes 132,SignWriting,Sgnw,8.0,0,𝡐,1D850,USA,1,American Sign Language,ase,Exclusion,no,no,no,Yes,no @@ -158,7 +158,7 @@ WR,Name,Script_Code,Age,Size,Sample,Sample_Code,Origin Country,~Density,Likely L 156,Khitan small script,Kits,13.0,0,𘱥,18C65,China,2,Khitan,zkt,Exclusion,no,Yes,no,Yes,no 157,Yezidi,Yezi,13.0,0,𐺈,10E88,Georgia,1,Northern Kurdish,ku,Exclusion,Yes,no,no,no,no 158,Cypro_Minoan,Cpmn,14.0,0,𒿥,12FE5,Cyprus,2,unknown,und,Exclusion,no,no,no,Yes,no -159,Old_Uyghur,Ougr,14.0,0,𐽼,10F7C,Central Asia,1,Old Uyghur,oui,Exclusion,Yes,no,Yes,no,no +159,Old_Uyghur,Ougr,14.0,0,𐽼,10F7C,China,1,Old Uyghur,oui,Exclusion,Yes,no,Yes,no,no 160,Tangsa,Tnsa,14.0,0,𖪼,16ABC,India,1,Tangsa,nst,Exclusion,no,no,no,no,no 161,Toto,Toto,14.0,0,𞊐,1E290,India,1,Toto,txo,Exclusion,no,no,no,no,no 162,Vithkuqi,Vith,14.0,0,𐖂,10582,Albania,1,Albanian,sq,Exclusion,no,no,no,no,Yes diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv index cf82e12dde7..64c5e34ffd8 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv @@ -1354,7 +1354,7 @@ Turkey TR "81,257,239" 94% "2,186,000,000,000" Balkan Gagauz Turkish bgx "370,0 Turkey TR "81,257,239" 94% "2,186,000,000,000" Bulgarian bg "341,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" English en 17% Turkey TR "81,257,239" 94% "2,186,000,000,000" Georgian ka "45,300" -Turkey TR "81,257,239" 94% 
"2,186,000,000,000" Kara-Kalpak kaa 1% https://joshuaproject.net/languages/kaa +Turkey TR "81,257,239" 94% "2,186,000,000,000" Kara-Kalpak kaa 0.1% https://joshuaproject.net/languages/kaa Turkey TR "81,257,239" 94% "2,186,000,000,000" Greek el "4,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Kabardian kbd "623,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Kazakh kk 600 "http://en.wikipedia.org/wiki/Kazakh_language - the script is an assumption, needs a reference" diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java index 6bfa2a607ff..e6a40a0e828 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java @@ -39,11 +39,11 @@ import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.LanguageTagParser; import org.unicode.cldr.util.Level; +import org.unicode.cldr.util.LocaleValidator; import org.unicode.cldr.util.ScriptToExemplars; import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.StandardCodes.LstrType; import org.unicode.cldr.util.SupplementalDataInfo; -import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; import org.unicode.cldr.util.Validity; import org.unicode.cldr.util.Validity.Status; @@ -703,85 +703,60 @@ public void testUndAllScriptsAndRegions() { } } - LanguageTagParser ltp = new LanguageTagParser(); - Set possibleFixes = new TreeSet<>(); - for (String region : regions) { - final String undRegion = "und_" + region; - if (region.equals("150") && likely.containsKey("und")) { - // skip - } else if (!assertTrue("contains und_" + region, likely.containsKey(undRegion))) { - Set languages = - SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region); - double biggest = -1; - String biggestLang = null; - for (String language : languages) { - PopulationData popData 
= - SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData( - language, region); - if (popData.getLiteratePopulation() > biggest) { - biggest = popData.getLiteratePopulation(); - biggestLang = language; - } - } - if (biggestLang != null) { - ltp.set(biggestLang); - if (ltp.getScript().isEmpty()) { - String biggestMax = likely.get(biggestLang); - ltp.set(biggestMax); - } - ltp.setRegion(region); - possibleFixes.add( - ""); - } - } - } - System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes)); + + // Note: this used to test for all combinations of und_ + territory code. + // But we are now dropping redundant items, so any case where und_XX expands to en_Latn_XX, + // the und_XX is dropped. + // The code is just commented out in case we change in the future. + + // LanguageTagParser ltp = new LanguageTagParser(); + // Set possibleFixes = new TreeSet<>(); + // for (String region : regions) { + // final String undRegion = "und_" + region; + // if (region.equals("150") && likely.containsKey("und")) { + // // skip + // } else if (!assertTrue("contains und_" + region, + // likely.containsKey(undRegion))) { + // Set languages = + // + // SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region); + // double biggest = -1; + // String biggestLang = null; + // for (String language : languages) { + // PopulationData popData = + // SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData( + // language, region); + // if (popData.getLiteratePopulation() > biggest) { + // biggest = popData.getLiteratePopulation(); + // biggestLang = language; + // } + // } + // if (biggestLang != null) { + // ltp.set(biggestLang); + // if (ltp.getScript().isEmpty()) { + // String biggestMax = likely.get(biggestLang); + // ltp.set(biggestMax); + // } + // ltp.setRegion(region); + // possibleFixes.add( + // ""); + // } + // } + // } + // System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes)); } + private static final Joiner JOIN_LS = 
Joiner.on(CldrUtility.LINE_SEPARATOR); + public void testToAttributeValidityStatus() { - Set okLanguages = VALIDITY.getStatusToCodes(LstrType.language).get(Status.regular); - Set okScripts = VALIDITY.getStatusToCodes(LstrType.script).get(Status.regular); - Set okRegions = VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular); Multimap badFieldsToLocales = TreeMultimap.create(); - Set knownExceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl"); for (String s : likely.values()) { - CLDRLocale cLocale = CLDRLocale.getInstance(s); - final String language = cLocale.getLanguage(); - final String script = cLocale.getScript(); - final String region = cLocale.getCountry(); - if (!okLanguages.contains(language)) { - if (knownExceptions.contains(language)) { - continue; - } - badFieldsToLocales.put(language, s); - } - if (!okScripts.contains(script)) { - badFieldsToLocales.put(script, s); - } - if (!okRegions.contains(region)) { - badFieldsToLocales.put(region, s); - } - } - if (!badFieldsToLocales.isEmpty()) { - Multimap statusToExamples = TreeMultimap.create(); - for (String field : badFieldsToLocales.keySet()) { - Status status = VALIDITY.getCodeToStatus(LstrType.language).get(field); - if (status == null) { - status = VALIDITY.getCodeToStatus(LstrType.script).get(field); - } - if (status == null) { - status = VALIDITY.getCodeToStatus(LstrType.region).get(field); - } - statusToExamples.put(status, field); - } - Map fieldToOrigin = new TreeMap<>(); - for (Entry> entry : statusToExamples.asMap().entrySet()) { - // for (String value : entry.getValue()) { - // String origin = - // SUPPLEMENTAL_DATA_INFO.getLikelyOrigins().get(value); - // fieldToOrigin.put(value, origin == null ? 
"n/a" : origin); - // } - warnln("Bad status=" + entry.getKey() + " for " + entry.getValue()); + LanguageTagParser ltp = new LanguageTagParser().set(s); + Set errors = new LinkedHashSet<>(); + if (!LocaleValidator.isValid(ltp, LocaleValidator.ALLOW_IN_LIKELY, errors)) { + errln(Joiner.on('\t').join("Allowed subtag failure:", ltp, errors)); + continue; } } } diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java index bd269f492ee..353a724fae5 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java @@ -900,14 +900,6 @@ public void testLanguageTagParserIsValid() { // likely subtags - LocaleValidator.AllowedValid allow001 = - new LocaleValidator.AllowedValid( - null, - LstrType.region, - new LocaleValidator.AllowedMatch("001|419"), - LstrType.language, - new LocaleValidator.AllowedMatch("und|in|iw|ji|jw|mo|tl")); - Map exceptions = Map.of( // "und_QO", "Disallowed region=QO, status=macroregion" @@ -918,13 +910,13 @@ public void testLanguageTagParserIsValid() { final String value = entry.getValue(); String expected = CldrUtility.ifNull(exceptions.get(key), ""); - LocaleValidator.isValid(ltp.set(key), allow001, errors); + LocaleValidator.isValid(ltp.set(key), LocaleValidator.ALLOW_IN_LIKELY, errors); assertEquals(key, expected, Joiner.on("; ").join(errors)); if (!expected.isEmpty()) { warnln("Likely subtags, skipping " + ltp + ", " + expected); } - LocaleValidator.isValid(ltp.set(value), allow001, errors); + LocaleValidator.isValid(ltp.set(value), LocaleValidator.ALLOW_IN_LIKELY, errors); assertEquals(value, "", Joiner.on("; ").join(errors)); }