From db1ba18f95a84518094d2f048a546baf4046118d Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 2 Dec 2024 09:32:53 -0800 Subject: [PATCH] CLDR-18129 Investigate and fix (where necessary) invalid codes (#4215) --- common/dtd/ldml.dtd | 4 +- common/dtd/ldmlSupplemental.dtd | 26 +-- common/main/en.xml | 5 +- common/main/fi.xml | 2 +- common/main/la.xml | 10 +- common/main/nl.xml | 1 - common/supplemental/supplementalData.xml | 2 +- common/supplemental/supplementalMetadata.xml | 5 +- common/validity/language.xml | 4 +- exemplars/main/rna.xml | 23 --- .../org/unicode/cldr/util/MatchValue.java | 149 +++++++++++++++--- .../cldr/util/SupplementalDataInfo.java | 11 +- .../org/unicode/cldr/util/UnitConverter.java | 37 +++-- .../java/org/unicode/cldr/util/Validity.java | 9 +- .../cldr/unittest/TestAttributeValues.java | 34 ++-- .../org/unicode/cldr/unittest/TestBasic.java | 57 ++++++- .../cldr/unittest/TestSupplementalInfo.java | 45 ++++++ 17 files changed, 321 insertions(+), 103 deletions(-) delete mode 100644 exemplars/main/rna.xml diff --git a/common/dtd/ldml.dtd b/common/dtd/ldml.dtd index 984e03305a5..e02fff900ca 100644 --- a/common/dtd/ldml.dtd +++ b/common/dtd/ldml.dtd @@ -61,7 +61,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -95,7 +95,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + diff --git a/common/dtd/ldmlSupplemental.dtd b/common/dtd/ldmlSupplemental.dtd index 0ce28c9755e..237fb957063 100644 --- a/common/dtd/ldmlSupplemental.dtd +++ b/common/dtd/ldmlSupplemental.dtd @@ -65,7 +65,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -113,7 +113,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -284,7 +284,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -297,7 +297,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -702,7 +702,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -711,7 +711,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -720,9 +720,9 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + - + @@ -738,7 +738,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -914,7 +914,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -962,9 +962,9 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + - + @@ -996,7 +996,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + diff --git a/common/main/en.xml b/common/main/en.xml index 70918a060d0..29542a24576 100644 --- a/common/main/en.xml +++ b/common/main/en.xml @@ -139,6 +139,7 @@ annotations. Coptic Capiznon Cree + Woods Cree Michif Crimean Tatar Southern East Cree @@ -152,7 +153,6 @@ annotations. Swampy Cree Church Slavic Chuvash - Woods Cree Welsh Danish Dakota @@ -256,7 +256,6 @@ annotations. Hakka Chinese Hawaiian Southern Haida - Northern Haida Hebrew Hindi Hindi (Latin) @@ -284,7 +283,6 @@ annotations. Igbo Sichuan Yi Inupiaq - Eastern Canadian Inuktitut Western Canadian Inuktitut Iloko Ingush @@ -474,7 +472,6 @@ annotations. Ojibwa Northwestern Ojibwa Central Ojibwa - Eastern Ojibwa Oji-Cree Western Ojibwa Okanagan diff --git a/common/main/fi.xml b/common/main/fi.xml index c56be83d76c..7bbe4e1dec6 100644 --- a/common/main/fi.xml +++ b/common/main/fi.xml @@ -31,7 +31,7 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/ afrihili aghem ainu - urduni + urduni akan akkadi alabama diff --git a/common/main/la.xml b/common/main/la.xml index ced058291d1..40a5e6ff4f8 100644 --- a/common/main/la.xml +++ b/common/main/la.xml @@ -24,7 +24,6 @@ CLDR data files are interpreted according to the LDML specification (http://unic Atropatenica Ruthenica Alba Bulgarica - Bihari Bengalica Tibetana Britonica @@ -66,12 +65,12 @@ CLDR data files are interpreted according to the LDML specification (http://unic Interlingua Interlingue Igbonica - Indonesia + Indonesia Islandica Italiana - Hebraica + Hebraica Iaponica - Iudaeogermanica + Iudaeogermanica Iavensis Georgiana Cazachica @@ -213,7 +212,6 @@ CLDR data files are interpreted according to the LDML specification (http://unic Brasilia Insulae Bahamenses Butania - Birmania Insula Bouvet Botswana Ruthenia Alba @@ -237,7 +235,6 @@ CLDR data files are interpreted according to the LDML specification (http://unic Insula Christi Natalis Cyprus Cechia - Res publica Democratica Germanica Germania Gibutum Dania @@ -421,7 +418,6 @@ CLDR data files are interpreted according to the LDML specification (http://unic Kosovia Iemenia Maiotta - Iugoslavia Africa Australis Zambia Zimbabua diff --git a/common/main/nl.xml b/common/main/nl.xml index b73d6893e20..a8e9fa1d1ba 100644 --- a/common/main/nl.xml +++ b/common/main/nl.xml @@ -31,7 +31,6 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/ Afrihili Aghem Aino - Zuid-Levantijns-Arabisch Akan Akkadisch Alabama diff --git a/common/supplemental/supplementalData.xml b/common/supplemental/supplementalData.xml index b963f1466e6..58893bc9318 100644 --- a/common/supplemental/supplementalData.xml +++ b/common/supplemental/supplementalData.xml @@ -4930,7 +4930,7 @@ XXX Code for transations where no currency is involved - + diff --git a/common/supplemental/supplementalMetadata.xml b/common/supplemental/supplementalMetadata.xml index 7b3f94549a4..d8e5052adef 100644 --- a/common/supplemental/supplementalMetadata.xml +++ b/common/supplemental/supplementalMetadata.xml @@ -179,7 +179,7 @@ For terms of use, see http://www.unicode.org/copyright.html - + @@ -306,6 +306,9 @@ For terms of use, see http://www.unicode.org/copyright.html + + + diff --git a/common/validity/language.xml b/common/validity/language.xml index bfe60a517f1..912dac91d86 100644 --- a/common/validity/language.xml +++ b/common/validity/language.xml @@ -76,7 +76,7 @@ cia~e cih cik cim~n cip cir ciw ciy cja cje cjh~i cjk cjm~p cjs cjv cjy ckb ckh ckl~o ckq~v ckx~z - cla clc cle clh~m clo cls~u clw cly + cla clc cle clh~m clo clt~u clw cly cma cmc cme cmg cmi cml~m cmo cmr~t cna~c cng~i cnk~l cno~q cns~u cnw~x co coa~h coj~q cot~x coz @@ -628,7 +628,7 @@ aam adp agp ais ajp ajt~u als aoh arb asd aue ayr ayx~y azj baz bbz bcc bcl bgm bh bhk bic bij bjd bjq bkb blg bmy bpb btb btl bxk bxr bxx byy - cbe cbh cca ccq cdg cjr cka cld cmk cmn cnr coy cqu cug cum cwd + cbe cbh cca ccq cdg cjr cka cld cls cmk cmn cnr coy cqu cug cum cwd daf dap dgo dgu dha dhd dik diq dit djl dkl drh drr drw dud duj dwl ekc ekk elp emk emo esk fat fuc diff --git a/exemplars/main/rna.xml b/exemplars/main/rna.xml deleted file mode 100644 index de04a86b277..00000000000 --- a/exemplars/main/rna.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - - - - - left-to-right - top-to-bottom - - - - [a b c d e f g h i j k m n o p r s t u v w y z] - [l q x] - - diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java index 973186181c6..c0c9f17357e 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java @@ -4,7 +4,10 @@ import com.google.common.base.Splitter; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Multimap; +import com.google.common.collect.TreeMultimap; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row; import com.ibm.icu.impl.Row.R2; @@ -17,6 +20,7 @@ import com.vdurmont.semver4j.Semver.SemverType; import com.vdurmont.semver4j.SemverException; import java.text.ParseException; +import java.util.Collections; import java.util.Date; import java.util.EnumSet; import java.util.HashMap; @@ -104,8 +108,9 @@ public static MatchValue of(String command) { throw new IllegalArgumentException( "Illegal/Unimplemented match type: " + originalArg); } + // check for errors in the MatchValue functions if (!originalArg.equals(result.getName())) { - System.err.println( + throw new IllegalArgumentException( "Non-standard form or error: " + originalArg + " ==> " + result.getName()); } return result; @@ -157,14 +162,22 @@ public static class LocaleMatchValue extends MatchValue { private final Predicate variant; public LocaleMatchValue() { - this(null); + this(null, null, null, null); // use default status } public LocaleMatchValue(Set statuses) { - lang = new ValidityMatchValue(LstrType.language, statuses, false); - script = new ValidityMatchValue(LstrType.script, statuses, false); - region = new ValidityMatchValue(LstrType.region, statuses, false); - variant = new ValidityMatchValue(LstrType.variant, statuses, false); + this(statuses, statuses, statuses, statuses); + } + + public LocaleMatchValue( + Set langStatus, + Set scriptStatus, + Set regionStatus, + Set variantStatus) { + lang = new ValidityMatchValue(LstrType.language, langStatus, false); + script = new ValidityMatchValue(LstrType.script, scriptStatus, false); + region = new ValidityMatchValue(LstrType.region, regionStatus, false); + variant = new ValidityMatchValue(LstrType.variant, variantStatus, false); } @Override @@ -174,8 +187,11 @@ public String getName() { @Override public boolean is(String item) { + if (item.equals("root")) { + item = "und"; + } if (!item.contains("_")) { - return lang.is(item); + return checkLang(item); } LanguageTagParser ltp; try { @@ -183,7 +199,7 @@ public boolean is(String item) { } catch (Exception e) { return false; } - return lang.is(ltp.getLanguage()) + return checkLang(ltp.getLanguage()) && (ltp.getScript().isEmpty() || script.is(ltp.getScript())) && (ltp.getRegion().isEmpty() || region.is(ltp.getRegion())) && (ltp.getVariants().isEmpty() || and(variant, ltp.getVariants())) @@ -191,12 +207,54 @@ public boolean is(String item) { && ltp.getLocaleExtensions().isEmpty(); } + public boolean checkLang(String language) { + return lang.is(language); + } + @Override public String getSample() { return "de"; } } + /** + * Check for the language OR certain backwards-compatible exceptions for data to support + * retaining variants, namely likelySubtags: "in","iw","ji","jw","mo","tl" + */ + public static class XLocaleMatchValue extends LocaleMatchValue { + static final Set exceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl"); + + @Override + public boolean checkLang(String language) { + return super.checkLang(language) // first check normal + || exceptions.contains(language); + } + + @Override + public String getName() { + return "validity/locale-for-likely"; + } + } + + /** + * Check for the language OR certain backwards-compatible exceptions for language names: "fat", + * "sh", "tl", "tw" + */ + public static class NLocaleMatchValue extends LocaleMatchValue { + static final Set exceptions = Set.of("fat", "sh", "tl", "tw"); + + @Override + public boolean checkLang(String language) { + return super.checkLang(language) // first check normal + || exceptions.contains(language); + } + + @Override + public String getName() { + return "validity/locale-for-names"; + } + } + // TODO remove these if possible — ticket/10120 static final Set SCRIPT_HACK = ImmutableSet.of( @@ -253,6 +311,9 @@ public static EnumParser of(Class aClass) { } public Set parse(String text) { + if (text == null) { + return null; + } Set statuses = EnumSet.noneOf(aClass); boolean negative = text.startsWith("!"); if (negative) { @@ -293,18 +354,59 @@ public boolean isAll(Set statuses) { } public static class ValidityMatchValue extends MatchValue { + private static final Validity VALIDITY = Validity.getInstance(); + public static final Multimap DEFAULT_STATUS; + + static { + Multimap DEFAULT_STATUS_ = TreeMultimap.create(); + for (LstrType lstrType : LstrType.values()) { + switch (lstrType) { + case region: + DEFAULT_STATUS_.putAll( + lstrType, + Set.of( + Status.regular, + Status.unknown, + Status.macroregion, + Status.special)); + break; + case language: + case script: + DEFAULT_STATUS_.putAll( + lstrType, Set.of(Status.regular, Status.unknown, Status.special)); + break; + case subdivision: + case currency: + DEFAULT_STATUS_.putAll( + lstrType, + Set.of(Status.regular, Status.unknown, Status.deprecated)); + break; + default: + DEFAULT_STATUS_.putAll(lstrType, Set.of(Status.regular, Status.unknown)); + break; + } + } + DEFAULT_STATUS = ImmutableMultimap.copyOf(DEFAULT_STATUS_); + } + + private static Map shortCodeToStatus; + private static final EnumParser validityStatusParser = EnumParser.of(Status.class); + private final LstrType type; private final boolean shortId; private final Set statuses; - private static Map shortCodeToStatus; - private static final EnumParser enumParser = EnumParser.of(Status.class); @Override public String getName() { + Collections a; return "validity/" + (shortId ? "short-" : "") + type.toString() - + (enumParser.isAll(statuses) ? "" : "/" + enumParser.format(statuses)); + + (statuses.equals(Set.copyOf(DEFAULT_STATUS.get(type))) + ? "" + : statuses.equals(VALIDITY.getStatusToCodes(type).keySet()) + ? "/all" + : "/" + validityStatusParser.format(statuses)); } private ValidityMatchValue(LstrType type) { @@ -317,21 +419,28 @@ private ValidityMatchValue(LstrType type, Set statuses, boolean shortId) throw new IllegalArgumentException("short- not supported except for units"); } this.shortId = shortId; + // validForType = Validity.getInstance().getStatusToCodes(type).keySet(); this.statuses = - statuses == null ? EnumSet.allOf(Status.class) : ImmutableSet.copyOf(statuses); + ImmutableSet.copyOf(statuses == null ? DEFAULT_STATUS.get(type) : statuses); } public static MatchValue of(String typeName) { if (typeName.equals("locale")) { return new LocaleMatchValue(); } + if (typeName.equals("locale-for-likely")) { + return new XLocaleMatchValue(); + } + if (typeName.equals("locale-for-names")) { + return new NLocaleMatchValue(); + } if (typeName.equals("bcp47-wellformed")) { return new BCP47LocaleWellFormedMatchValue(); } + String statusPart = null; int slashPos = typeName.indexOf('/'); - Set statuses = null; if (slashPos > 0) { - statuses = enumParser.parse(typeName.substring(slashPos + 1)); + statusPart = typeName.substring(slashPos + 1); typeName = typeName.substring(0, slashPos); } boolean shortId = typeName.startsWith("short-"); @@ -339,6 +448,10 @@ public static MatchValue of(String typeName) { typeName = typeName.substring(6); } LstrType type = LstrType.fromString(typeName); + Set statuses = + "all".equals(statusPart) + ? VALIDITY.getStatusToCodes(type).keySet() + : validityStatusParser.parse(statusPart); return new ValidityMatchValue(type, statuses, shortId); } @@ -366,9 +479,7 @@ public boolean is(String item) { == null) { // lazy evaluation to avoid circular dependencies Map _shortCodeToStatus = new TreeMap<>(); for (Entry entry : - Validity.getInstance() - .getCodeToStatus(LstrType.unit) - .entrySet()) { + VALIDITY.getCodeToStatus(LstrType.unit).entrySet()) { String key = entry.getKey(); Status status = entry.getValue(); final String shortKey = key.substring(key.indexOf('-') + 1); @@ -389,13 +500,13 @@ public boolean is(String item) { default: break; } - final Status status = Validity.getInstance().getCodeToStatus(type).get(item); + final Status status = VALIDITY.getCodeToStatus(type).get(item); return status != null && statuses.contains(status); } @Override public String getSample() { - return Validity.getInstance().getCodeToStatus(type).keySet().iterator().next(); + return VALIDITY.getCodeToStatus(type).keySet().iterator().next(); } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java index 2caae47d1b2..a298de5c959 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java @@ -1312,7 +1312,7 @@ private void makeStuffSafe() { if (unitAliases != null) { // don't load unless the information is there (for old releases); unitConverter.addAliases(unitAliases); } - unitConverter.freeze(); + unitConverter.freeze(new File(directory, "../validity").toString()); rationalParser.freeze(); unitPreferences.freeze(); @@ -2166,8 +2166,13 @@ private boolean handleMetadata(String level2, String value, XPathValue parts) { } return true; } else if (level3.equals("attributeValues")) { - AttributeValidityInfo.add( - parts.getAttributes(-1), value, attributeValidityInfo); + // the keyboard directory disappeared in new versions. + // supplementalData/metadata/validity/attributeValues[@dtds="keyboard"][@elements="keyMap"][@attributes="modifiers"][@type="TODO"] + final String dtdsValue = parts.getAttributeValue(-1, "dtds"); + if (!"keyboard".equals(dtdsValue) && !"platform".equals(dtdsValue)) { + AttributeValidityInfo.add( + parts.getAttributes(-1), value, attributeValidityInfo); + } return true; } } else if (level2.equals("serialElements")) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java index 789350e075d..482dff86b1c 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java @@ -168,6 +168,10 @@ public boolean isFrozen() { @Override public UnitConverter freeze() { + return freeze(CLDRPaths.VALIDITY_DIRECTORY); + } + + public UnitConverter freeze(String validityDirectory) { if (!frozen) { frozen = true; rationalParser.freeze(); @@ -185,7 +189,7 @@ public UnitConverter freeze() { baseUnits = builder.build(); targetInfoComparator = new TargetInfoComparator(); - buildMapComparators(); + buildMapComparators(validityDirectory); // must be after building comparators idToUnitId = ImmutableMap.copyOf(buildIdToUnitId()); @@ -194,14 +198,19 @@ public UnitConverter freeze() { } public void buildMapComparators() { + buildMapComparators(CLDRPaths.VALIDITY_DIRECTORY); + } + + public void buildMapComparators(String validityDirectory) { Set> all = new TreeSet<>(); + final Validity validity = Validity.getInstance(validityDirectory); Set baseSeen = new HashSet<>(); + if (DEBUG) { UnitParser up = new UnitParser(componentTypeData); Output uict = new Output<>(); - for (String longUnit : - Validity.getInstance().getStatusToCodes(LstrType.unit).get(Status.regular)) { + for (String longUnit : validity.getStatusToCodes(LstrType.unit).get(Status.regular)) { String shortUnit = getShortId(longUnit); up.set(shortUnit); List items = new ArrayList<>(); @@ -219,8 +228,7 @@ public void buildMapComparators() { System.out.println(shortUnit + "\t" + Joiner.on('\t').join(items)); } } - for (String longUnit : - Validity.getInstance().getStatusToCodes(LstrType.unit).get(Status.regular)) { + for (String longUnit : validity.getStatusToCodes(LstrType.unit).get(Status.regular)) { Output base = new Output<>(); String shortUnit = getShortId(longUnit); ConversionInfo conversionInfo = parseUnitId(shortUnit, base, false); @@ -232,18 +240,23 @@ public void buildMapComparators() { conversionInfo = parseUnitId("kelvin", base, false); } } - String quantity; + String quantity = null; Integer quantityNumericOrder = null; try { quantity = getQuantityFromUnit(base.value, false); + if (quantity == null && "beaufort".equals(shortUnit)) { + quantity = "speed"; + } quantityNumericOrder = quantityComparator.getNumericOrder(quantity); } catch (Exception e) { System.out.println( - "Failed " + "Failed to build unit comparator for " + shortUnit + ", " + base + ", " + + quantity + + ", " + quantityNumericOrder + ", " + e); @@ -284,7 +297,11 @@ public void buildMapComparators() { "Add new unitSystem to a grouping: " + sortingSystem); } R4 sortKey = - Row.of(quantityNumericOrder, sortingSystem, conversionInfo.factor, shortUnit); + Row.of( + quantityNumericOrder, + sortingSystem, + conversionInfo == null ? Rational.INFINITY : conversionInfo.factor, + shortUnit); all.add(sortKey); } LongUnitIdOrder.setErrorOnMissing(true); @@ -1882,16 +1899,16 @@ public BiMap getBaseUnitToQuantity() { return (BiMap) baseUnitToQuantity; } + /** Returns null if unit can't be parsed */ public String getQuantityFromUnit(String unit, boolean showYourWork) { Output metricUnit = new Output<>(); unit = fixDenormalized(unit); try { ConversionInfo unitInfo = parseUnitId(unit, metricUnit, showYourWork); - return metricUnit.value == null ? null : getQuantityFromBaseUnit(metricUnit.value); } catch (Exception e) { - System.out.println("Failed with " + unit + ", " + metricUnit + "\t" + e); return null; } + return metricUnit.value == null ? null : getQuantityFromBaseUnit(metricUnit.value); } public String getQuantityFromBaseUnit(String baseUnit) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/Validity.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/Validity.java index 358fc099df8..ca56301fc01 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/Validity.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/Validity.java @@ -2,7 +2,9 @@ import com.google.common.base.Splitter; import com.google.common.collect.ImmutableSet; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.EnumMap; import java.util.LinkedHashMap; @@ -38,6 +40,11 @@ public static Validity getInstance() { } public static Validity getInstance(String validityDirectory) { + try { + validityDirectory = new File(validityDirectory).getCanonicalFile().toString(); + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } Validity result = cache.get(validityDirectory); if (result == null) { final Validity value = new Validity(validityDirectory); @@ -79,7 +86,7 @@ private Validity(String validityDirectory) { codeToStatus.put(type, subCodeToStatus = new LinkedHashMap<>()); } - XMLFileReader.loadPathValues(basePath + file, lineData, true); + XMLFileReader.loadPathValues(new File(basePath, file).toString(), lineData, true); for (Pair item : lineData) { XPathValue parts = SimpleXPathParts.getFrozenInstance(item.getFirst()); if (!"id".equals(parts.getElement(-1))) { diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAttributeValues.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAttributeValues.java index a288cafb6b5..e2bf827b68c 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAttributeValues.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAttributeValues.java @@ -92,7 +92,7 @@ public void TestValid() { addXMLFiles(dtdType, mainDirs + stringDir, files); if (isVerbose()) synchronized (pathChecker.testLog) { - warnln(mainDirs + stringDir); + logln(mainDirs + stringDir); } } Stream stream = SERIAL ? files.stream() : files.parallelStream(); @@ -102,7 +102,7 @@ public void TestValid() { // checkFile(pathChecker, file); // } } - pathChecker.show(isVerbose(), showStatuses); + pathChecker.show(dtdType, isVerbose(), showStatuses); } // List localesToTest = Arrays.asList("en", "root"); // , "zh", "hi", "ja", // "ru", "cy" @@ -145,9 +145,9 @@ private void addXMLFiles(DtdType dtdType, String path, Set files) { } else { for (String file : dirFile.list()) { String localeID = file.replace(".xml", ""); - if (StandardCodes.isLocaleAtLeastBasic(localeID)) { - addXMLFiles(dtdType, path + "/" + file, files); - } + // if (StandardCodes.isLocaleAtLeastBasic(localeID)) { + addXMLFiles(dtdType, path + "/" + file, files); + // } } } } @@ -186,7 +186,8 @@ private void checkFile(PathChecker pathChecker, String fullFile) { ++_attributeCount; String attribute = r.getAttributeLocalName(i); String attributeValue = r.getAttributeValue(i); - pathChecker.checkAttribute(element, attribute, attributeValue); + pathChecker.checkAttribute( + fullFile, element, attribute, attributeValue); } break; } @@ -237,7 +238,7 @@ public PathChecker(TestFmwk testLog, DtdData dtdData) { matchValues = ImmutableMap.copyOf(_matchValues); } - private void checkPath(String path) { + private void checkPath(String fullFile, String path) { if (seen.contains(path)) { return; } @@ -251,19 +252,20 @@ private void checkPath(String path) { for (Entry entry : parts.getAttributes(elementIndex).entrySet()) { String attribute = entry.getKey(); String attrValue = entry.getValue(); - checkAttribute(element, attribute, attrValue); + checkAttribute(fullFile, element, attribute, attrValue); } } } - public void checkElement(String element, Attributes atts) { + public void checkElement(String fullFile, String element, Attributes atts) { int length = atts.getLength(); for (int i = 0; i < length; ++i) { - checkAttribute(element, atts.getQName(i), atts.getValue(i)); + checkAttribute(fullFile, element, atts.getQName(i), atts.getValue(i)); } } - private void checkAttribute(String element, String attribute, String attrValue) { + private void checkAttribute( + String fullFile, String element, String attribute, String attrValue) { // skip cases we know we don't need to test if (!needsTesting.containsEntry(element, attribute)) { return; @@ -296,16 +298,18 @@ private void checkAttribute(String element, String attribute, String attrValue) // Set breakpoint here for debugging (referenced from // http://cldr.unicode.org/development/testattributevalues) dtdData.getValueStatus(element, attribute, attrValue); + testLog.warnln( + Joiner.on('\t').join("Invalid", fullFile, element, attribute, attrValue)); } synchronized (valueStatusInfo) { valueStatusInfo.put(valueStatus, element, attribute, attrValue, Boolean.TRUE); } } - void show(boolean verbose, ImmutableSet retain) { + void show(DtdType dtdType, boolean verbose, ImmutableSet retain) { if (dtdData.dtdType == DtdType.keyboard3 && testLog.logKnownIssue("CLDR-14974", "skipping for keyboard")) { - testLog.warnln("Skipping for keyboard3"); + testLog.warnln("keyboard3 is missing validity checks"); } boolean haveProblems = false; for (ValueStatus valueStatus : ValueStatus.values()) { @@ -323,7 +327,9 @@ void show(boolean verbose, ImmutableSet retain) { } StringBuilder out = new StringBuilder(); out.append( - "\nIf the test fails, look at https://cldr.unicode.org/development/cldr-development-site/testattributevalues\n"); + "For " + + dtdType.directories + + "\nIf the test fails, use -v for details. Also look at https://cldr.unicode.org/development/updating-codes/testattributevalues for guidance.\n"); out.append("file\tCount:\t" + dtdData.dtdType + "\t" + fileCount + "\n"); out.append("element\tCount:\t" + dtdData.dtdType + "\t" + elementCount + "\n"); diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java index c21c6c5689c..9a3f3773249 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java @@ -5,6 +5,8 @@ import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; +import com.google.common.collect.Sets.SetView; import com.google.common.collect.TreeMultimap; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row; @@ -41,6 +43,7 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; +import java.util.stream.Collectors; import org.unicode.cldr.test.DisplayAndInputProcessor; import org.unicode.cldr.tool.CldrVersion; import org.unicode.cldr.tool.LikelySubtags; @@ -195,7 +198,7 @@ private void checkDtds( } else if (fileName.getPath().contains("/keyboards/3.0/") && logKnownIssue( "CLDR-17574", "With v46, parsing issues for keyboard xml files")) { - ; // do nothing, skip test + // do nothing, skip test } else if (name.endsWith(".xml")) { data.add(check(fileName)); if (deepCheck // takes too long to do all the time @@ -1652,5 +1655,57 @@ public void sortPaths(Comparator dc, Collection paths) { public void sortPaths(Comparator dc, String... array) { Arrays.sort(array, 0, array.length, dc); } + // public void TestNewDtdData() moved to TestDtdData + + public void testBcp47Ids() { + if (!TestCLDRPaths.canUseArchiveDirectory()) { + return; + } + final File ARCHIVE = new File(CLDRPaths.ARCHIVE_DIRECTORY); + Set> seen = new LinkedHashSet<>(); + + // get the archive directories in reverse order (latest first) + + TreeSet sortedArchiveDirectories = new TreeSet<>(Collections.reverseOrder()); + sortedArchiveDirectories.addAll(Arrays.asList(ARCHIVE.listFiles())); + + // get the BCP 47 keys to test against + + Set> newKeys = pairs(SUPPLEMENTAL_DATA_INFO.getBcp47Keys()); + + for (File file : sortedArchiveDirectories) { + if (!file.getName().startsWith("cldr-")) { + continue; + } + if (file.getName().compareTo("cldr-44.0") < 0) { + break; + } + logln(file.toString()); + File supplementalDir = new File(file, "common/supplemental"); + SupplementalDataInfo otherSupplementalData; + try { + otherSupplementalData = SupplementalDataInfo.getInstance(supplementalDir); + } catch (RuntimeException e) { + errln("Can't create SupplementalDataInfo for " + supplementalDir); + throw e; + // continue; + } + Set> oldKeys = pairs(otherSupplementalData.getBcp47Keys()); + if (!newKeys.containsAll(oldKeys)) { + SetView> oldButNotNew = Sets.difference(oldKeys, newKeys); + SetView> oldButNotNewMinusSeen = + Sets.difference(oldButNotNew, seen); + if (!assertEquals(file.toString(), Collections.emptySet(), oldButNotNewMinusSeen)) { + seen.addAll(oldButNotNewMinusSeen); + } + } + } + } + + private Set> pairs(Relation bcp47Keys) { + return bcp47Keys.entrySet().stream() + .map(x -> Pair.of(x.getKey(), x.getValue())) + .collect(Collectors.toCollection(TreeSet::new)); + } } diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java index fcf753bbccb..87ab3b0780f 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java @@ -78,6 +78,7 @@ import org.unicode.cldr.util.PreferredAndAllowedHour.HourStyle; import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.StandardCodes.CodeType; +import org.unicode.cldr.util.StandardCodes.LstrField; import org.unicode.cldr.util.StandardCodes.LstrType; import org.unicode.cldr.util.SupplementalDataInfo; import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; @@ -2176,4 +2177,48 @@ public void TestGrammarInfo() { } } } + + public void testPredominantEncompassed() { + // maybe check with lstreg instead? They should be in sync. + Map>> lstreg = StandardCodes.getEnumLstreg(); + + SupplementalDataInfo supp = SupplementalDataInfo.getInstance(); + // Returns type -> tag -> , like "language" -> "sh" -> <{"sr_Latn"}, reason> + Map, String>>> locAliases = supp.getLocaleAliasInfo(); + Map, String>> langAliases = locAliases.get("language"); + Set skip = Set.of("no", "sh"); + + Iso639Data.getNames("a"); // init (need to fix) + + Set macros = Iso639Data.getMacros(); + main: + for (String macro : macros) { + if (skip.contains(macro)) { + continue; + } + Set encompasseds = Iso639Data.getEncompassedForMacro(macro); + final List encompassedNames = + encompasseds.stream().map(x -> codeAndName(x)).collect(Collectors.toList()); + for (String encompassed : encompasseds) { + R2, String> data = langAliases.get(encompassed); + if (data != null) { + if (data.get0().contains(macro)) { + logln( + codeAndName(macro) + + "has predominant " + + codeAndName(encompassed) + + " in encompassed: " + + encompassedNames); + continue main; + } + } + } + errln("ERROR " + codeAndName(macro) + " missing predominent from " + encompassedNames); + } + } + + private String codeAndName(String macro) { + // TODO Auto-generated method stub + return CLDRConfig.getInstance().getEnglish().getName(macro) + " (" + macro + ")"; + } }