From e8ca95fe71154f41841cb4a5b13db7fb5a0142e5 Mon Sep 17 00:00:00 2001 From: macchiati Date: Sun, 25 Feb 2024 23:07:56 -0800 Subject: [PATCH 1/2] CLDR-17407 check for anomalies and fix --- common/properties/coverageLevels.txt | 1 - common/supplemental/coverageLevels.xml | 16 +-- .../org/unicode/cldr/util/StandardCodes.java | 6 +- .../cldr/util/SupplementalDataInfo.java | 106 +++++++++++++++++- .../org/unicode/cldr/util/data/Locales.txt | 8 +- .../unicode/cldr/util/TestStandardCodes.java | 10 ++ 6 files changed, 131 insertions(+), 16 deletions(-) diff --git a/common/properties/coverageLevels.txt b/common/properties/coverageLevels.txt index fc4a5d31df7..3a2bb78c703 100644 --- a/common/properties/coverageLevels.txt +++ b/common/properties/coverageLevels.txt @@ -144,7 +144,6 @@ th ; modern ; Thai ti ; basic ; Tigrinya tk ; modern ; Turkmen to ; basic ; Tongan -tok ; basic ; Toki Pona tr ; modern ; Turkish tt ; basic ; Tatar ug ; basic ; Uyghur diff --git a/common/supplemental/coverageLevels.xml b/common/supplemental/coverageLevels.xml index 457cdc18660..9f36c83ae58 100644 --- a/common/supplemental/coverageLevels.xml +++ b/common/supplemental/coverageLevels.xml @@ -939,12 +939,14 @@ For terms of use, see http://www.unicode.org/copyright.html - + @@ -956,14 +958,14 @@ For terms of use, see http://www.unicode.org/copyright.html - - - - - - + + + + + + diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/StandardCodes.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/StandardCodes.java index f3c27281717..02f8f9fe478 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/StandardCodes.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/StandardCodes.java @@ -149,6 +149,10 @@ private Map> getCodeData(CodeType type) { return type_code_data.get(type); } + public Set getCodes(CodeType type) { + return type_code_data.get(type).keySet(); + } + /** * Get at the language registry values, as a Map from label to value. 
* @@ -275,7 +279,7 @@ public Set getGoodAvailableCodes(CodeType type) { case script: return sd.getCLDRScriptCodes(); case tzid: - break; // nothing special + return sd.getCLDRTimezoneCodes(); default: for (Iterator it = result.iterator(); it.hasNext(); ) { String code = it.next(); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java index b2bb5111875..1ad85a95a85 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java @@ -4,6 +4,8 @@ import com.google.common.base.Joiner; import com.google.common.base.Splitter; +import com.google.common.base.Supplier; +import com.google.common.base.Suppliers; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableSetMultimap; @@ -74,6 +76,7 @@ import org.unicode.cldr.util.GrammarInfo.GrammaticalScope; import org.unicode.cldr.util.GrammarInfo.GrammaticalTarget; import org.unicode.cldr.util.Rational.RationalParser; +import org.unicode.cldr.util.StandardCodes.CodeType; import org.unicode.cldr.util.StandardCodes.LstrType; import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; import org.unicode.cldr.util.SupplementalDataInfo.NumberingSystemInfo.NumberingSystemType; @@ -981,6 +984,10 @@ public enum RBNFGroup { public Map, String> bcp47Since = new TreeMap<>(); public Map, String> bcp47Preferred = new TreeMap<>(); public Map, String> bcp47Deprecated = new TreeMap<>(); + + Map> bcp47KeyToSubtypeToInfo = new TreeMap<>(); + Map> bcp47KeyToAliasToSubtype = new TreeMap<>(); + public Map bcp47ValueType = new TreeMap<>(); public Map> validityInfo = new LinkedHashMap<>(); @@ -1145,6 +1152,34 @@ private SupplementalDataInfo(File directory) { this.validity = Validity.getInstance(directory.toString() + 
"/../validity/"); } // hide + public static class Bcp47KeyInfo { + public Bcp47KeyInfo( + Set aliases, + String description, + String since, + String preferred, + String deprecated) { + this.description = description; + this.deprecated = !(deprecated == null || deprecated.equals("false")); + this.preferred = preferred; + this.since = since == null ? null : VersionInfo.getInstance(since); + this.aliases = aliases; + } + + final String description; + final VersionInfo since; + final String preferred; + final boolean deprecated; + final Set aliases; + + @Override + public String toString() { + return String.format( + "{description=«%s» since=%s preferred=%s deprecated=%s aliases=%s}", + description, since, preferred, deprecated, aliases); + } + } + private void makeStuffSafe() { // now make stuff safe allLanguages.addAll(languageToPopulation.keySet()); @@ -1224,19 +1259,54 @@ private void makeStuffSafe() { } typeToLocaleToDayPeriodInfo = CldrUtility.protectCollection(typeToLocaleToDayPeriodInfo); languageMatch = CldrUtility.protectCollection(languageMatch); - bcp47Key2Subtypes.freeze(); + bcp47Extension2Keys.freeze(); - bcp47Aliases.freeze(); + bcp47Key2Subtypes.freeze(); + CldrUtility.protectCollection(bcp47ValueType); if (bcp47Key2Subtypes.isEmpty()) { throw new InternalError( "No BCP47 key 2 subtype data was loaded from bcp47 dir " + getBcp47Directory().getAbsolutePath()); } + + bcp47Aliases.freeze(); CldrUtility.protectCollection(bcp47Descriptions); CldrUtility.protectCollection(bcp47Since); CldrUtility.protectCollection(bcp47Preferred); CldrUtility.protectCollection(bcp47Deprecated); - CldrUtility.protectCollection(bcp47ValueType); + + // create clean structure + + for (Entry> entry : bcp47Extension2Keys.keyValuesSet()) { + for (String key : entry.getValue()) { + Map subtypeToInfo = bcp47KeyToSubtypeToInfo.get(key); + if (subtypeToInfo == null) { + bcp47KeyToSubtypeToInfo.put(key, subtypeToInfo = new TreeMap<>()); + } + Map aliasToRegular = 
bcp47KeyToAliasToSubtype.get(key); + if (aliasToRegular == null) { + bcp47KeyToAliasToSubtype.put(key, aliasToRegular = new TreeMap<>()); + } + for (String subtype : bcp47Key2Subtypes.get(key)) { + final R2 pair = R2.of(key, subtype); + final Set aliases = bcp47Aliases.get(pair); + final Bcp47KeyInfo info = + new Bcp47KeyInfo( + aliases, + bcp47Descriptions.get(pair), + bcp47Since.get(pair), + bcp47Preferred.get(pair), + bcp47Deprecated.get(pair)); + subtypeToInfo.put(subtype, info); + final Map aliasToRegularFinal = aliasToRegular; + if (aliases != null) { + aliases.forEach(x -> aliasToRegularFinal.put(x, subtype)); + } + } + } + } + bcp47KeyToSubtypeToInfo = CldrUtility.protectCollection(bcp47KeyToSubtypeToInfo); + bcp47KeyToAliasToSubtype = CldrUtility.protectCollection(bcp47KeyToAliasToSubtype); CoverageLevelInfo.fixEU(coverageLevels, this); coverageLevels = Collections.unmodifiableSortedSet(coverageLevels); @@ -5173,4 +5243,34 @@ public UnitPrefixInfo getUnitPrefixInfo(String prefix) { public Set getUnitPrefixes() { return unitPrefixInfo.keySet(); } + + Supplier> goodTimezones = + Suppliers.memoize( + new Supplier>() { + @Override + public Set get() { + Set availableLongTz = sc.getAvailableCodes(CodeType.tzid); + Map aliasToRegular = bcp47KeyToAliasToSubtype.get("tz"); + Map subtypeToInfo = + bcp47KeyToSubtypeToInfo.get("tz"); + Set result = + availableLongTz.stream() + .filter( + x -> { + String shortId = aliasToRegular.get(x); + Bcp47KeyInfo info = + subtypeToInfo.get(shortId); + if (info.deprecated) { + System.out.println("deprecated: " + x); + } + return info.deprecated; + }) + .collect(Collectors.toUnmodifiableSet()); + return result; + } + }); + + public Set getCLDRTimezoneCodes() { + return goodTimezones.get(); + } } diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Locales.txt b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Locales.txt index 26646a2211a..0422bacff8a 100644 --- 
a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Locales.txt +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Locales.txt @@ -207,7 +207,7 @@ Google ; nn ; Modern ; Nynorsk Google ; no ; modern ; T2 Norwegian (Bokmål) Google ; or ; modern ; T5 Odia Google ; pa ; modern ; T4.1 Punjabi -Google ; pcm ; modern ; Nigerian Pidgin +Google ; pcm ; moderate ; Nigerian Pidgin Google ; pl ; modern ; T1 Polish Google ; ps ; modern ; T5 Pashto Google ; pt ; modern ; T1 Brazilian Portuguese @@ -283,7 +283,7 @@ Apple ; kn ; modern Apple ; ko ; modern Apple ; lt ; modern Apple ; lv ; modern -Apple ; mi ; modern +Apple ; mi ; moderate Apple ; mk ; modern Apple ; ml ; modern Apple ; mr ; modern @@ -485,7 +485,7 @@ Cldr ; cv ; basic Cldr ; en_AU ; modern Cldr ; es_MX ; modern Cldr ; fr_CA ; modern -Cldr ; mi ; modern +Cldr ; mi ; moderate Cldr ; zh_Hant_HK ; modern #Cldr other (from Google) @@ -516,7 +516,7 @@ Cldr ; su ; basic ; Sundanese (script TBD) Cldr ; ks_Deva ; basic ; Kashmiri (Devanagari) Cldr ; sd_Deva ; basic ; Sindhi (Devanagari script) # Cldr ; cad ; basic ; Caddo -Cldr ; pcm ; modern ; Nigerian Pidgin +Cldr ; pcm ; moderate ; Nigerian Pidgin Cldr ; bgc ; basic ; Haryanvi Cldr ; bho ; basic ; Bhojpuri diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestStandardCodes.java b/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestStandardCodes.java index 34eea77d547..0c3983a2b57 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestStandardCodes.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestStandardCodes.java @@ -1,8 +1,11 @@ package org.unicode.cldr.util; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import java.util.Set; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import 
org.junit.jupiter.params.provider.CsvSource; @@ -30,4 +33,11 @@ void testTargetCoverageLevel(final String locale, final String level) { "Expected getTargetCoverageLevel(%s)=%s but was %s", locale, expectLevel, actualLevel)); } + + @Test + void testTimezoneExclusions() { + SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); + Set timezones = sdi.getCLDRTimezoneCodes(); + assertFalse(timezones.contains("America/Nipigon")); + } } From aa8d28250ae79e5a762a8af623cf048edb8c4073 Mon Sep 17 00:00:00 2001 From: macchiati Date: Mon, 26 Feb 2024 13:41:16 -0800 Subject: [PATCH 2/2] CLDR-17407 Revert changes to beaufort, add hack to address deprecated timezone ids. --- common/supplemental/coverageLevels.xml | 16 +++-- .../cldr/util/SupplementalDataInfo.java | 63 ++++++++++++++----- .../cldr/unittest/TestExampleGenerator.java | 10 ++- .../unicode/cldr/util/TestStandardCodes.java | 2 + 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/common/supplemental/coverageLevels.xml b/common/supplemental/coverageLevels.xml index 9f36c83ae58..457cdc18660 100644 --- a/common/supplemental/coverageLevels.xml +++ b/common/supplemental/coverageLevels.xml @@ -939,14 +939,12 @@ For terms of use, see http://www.unicode.org/copyright.html - - --> @@ -958,13 +956,13 @@ For terms of use, see http://www.unicode.org/copyright.html - - - - - - + + + + + + diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java index 1ad85a95a85..062c0e8dfcc 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java @@ -10,6 +10,7 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableSetMultimap; import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; import com.google.common.collect.TreeMultimap; import 
com.ibm.icu.impl.IterableComparator; import com.ibm.icu.impl.Relation; @@ -5244,28 +5245,58 @@ public Set getUnitPrefixes() { return unitPrefixInfo.keySet(); } + /** + * Filter out deprecated items. This is more complicated than it seems. The deprecation is in + * timezones.xml, eg: We need to find the + * short IDs that are deprecated, but there is a problem due to + * https://unicode-org.atlassian.net/browse/CLDR-17412. + * + *

America/Nipigon, America/Thunder_Bay, America/Rainy_River + */ Supplier> goodTimezones = Suppliers.memoize( new Supplier>() { + @Override public Set get() { Set availableLongTz = sc.getAvailableCodes(CodeType.tzid); - Map aliasToRegular = bcp47KeyToAliasToSubtype.get("tz"); - Map subtypeToInfo = - bcp47KeyToSubtypeToInfo.get("tz"); - Set result = - availableLongTz.stream() - .filter( - x -> { - String shortId = aliasToRegular.get(x); - Bcp47KeyInfo info = - subtypeToInfo.get(shortId); - if (info.deprecated) { - System.out.println("deprecated: " + x); - } - return info.deprecated; - }) - .collect(Collectors.toUnmodifiableSet()); + Set result = null; + if (true) { // hack for now + final Set hack = + Set.of( + "America/Santa_Isabel", + "Australia/Currie", + "America/Yellowknife", + "America/Rainy_River", + "America/Thunder_Bay", + "America/Nipigon", + "America/Pangnirtung", + "Europe/Uzhgorod", + "Europe/Zaporozhye", + "Pacific/Johnston"); + result = Set.copyOf(Sets.difference(availableLongTz, hack)); + } else { // TODO restore when CLDR-17412 is fixed + Map aliasToRegular = + bcp47KeyToAliasToSubtype.get("tz"); + Map subtypeToInfo = + bcp47KeyToSubtypeToInfo.get("tz"); + result = + availableLongTz.stream() + .filter( + x -> { + String shortId = aliasToRegular.get(x); + Bcp47KeyInfo info = + subtypeToInfo.get(shortId); + System.out.println( + String.format( + "%s %s %s", + x, shortId, info)); + return !info.deprecated; + }) + .collect(Collectors.toUnmodifiableSet()); + } return result; } }); diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestExampleGenerator.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestExampleGenerator.java index 98001180d0f..14afd4f0b4c 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestExampleGenerator.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestExampleGenerator.java @@ -840,14 +840,12 @@ public void TestFallbackFormat() { public void Test4897() { 
ExampleGenerator exampleGenerator = getExampleGenerator("it"); + final CLDRFile cldrFile = exampleGenerator.getCldrFile(); for (String xpath : With.in( - exampleGenerator - .getCldrFile() - .iterator( - "//ldml/dates/timeZoneNames", - exampleGenerator.getCldrFile().getComparator()))) { - String value = exampleGenerator.getCldrFile().getStringValue(xpath); + cldrFile.iterator( + "//ldml/dates/timeZoneNames", cldrFile.getComparator()))) { + String value = cldrFile.getStringValue(xpath); String actual = exampleGenerator.getExampleHtml(xpath, value); if (actual == null) { if (!xpath.contains("singleCountries") && !xpath.contains("gmtZeroFormat")) { diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestStandardCodes.java b/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestStandardCodes.java index 0c3983a2b57..6cc91e02f20 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestStandardCodes.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestStandardCodes.java @@ -3,6 +3,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.Set; import org.junit.jupiter.api.Test; @@ -38,6 +39,7 @@ void testTargetCoverageLevel(final String locale, final String level) { void testTimezoneExclusions() { SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); Set timezones = sdi.getCLDRTimezoneCodes(); + assertTrue(timezones.contains("Europe/Andorra")); assertFalse(timezones.contains("America/Nipigon")); } }