CLDR-15954 Clean up generation software, regenerate the tests, and fix the logical-group error
unicode-org · Mar 18, 2024 · 2884647 · 2884647
1 parent 16131bb
commit 2884647
Show file tree

Hide file tree

Showing 5 changed files with 148 additions and 120 deletions.
diff --git a/common/testData/units/unitLocalePreferencesTest.txt b/common/testData/units/unitLocalePreferencesTest.txt
@@ -1,4 +1,3 @@
-
 # Test data for unit locale preferences
 #  Copyright © 1991-2024 Unicode, Inc.
 #  For terms of use, see http://www.unicode.org/copyright.html
@@ -16,24 +15,24 @@
 #
 #	 The input and expected output units are unit identifiers; in particular, the output does not have further processing:
 #		 • no localization
-
-fahrenheit;	1;	default;	en-u-rg-uszzzz-ms-ussystem-mu-celsius;	celsius;	-155/9 # mu > ms > rg > (likely) region
+#
+fahrenheit;	1;	default;	en-u-rg-uszzzz-ms-ussystem-mu-celsius;	celsius;	-155/9	# mu > ms > rg > (likely) region
 fahrenheit;	1;	default;	en-u-rg-uszzzz-ms-ussystem-mu-celsius;	celsius;	-155/9
 fahrenheit;	1;	default;	en-u-rg-uszzzz-ms-metric;	celsius;	-155/9
 fahrenheit;	1;	default;	en-u-rg-dezzzz;	celsius;	-155/9
-fahrenheit;	1;	default;	en-DE;	celsius;	-155/9 # explicit region > likely region
+fahrenheit;	1;	default;	en-DE;	celsius;	-155/9	# explicit region > likely region
 fahrenheit;	1;	default;	en-US;	fahrenheit;	1
-fahrenheit;	1;	default;	en;	fahrenheit;	1 # likely region = US
+fahrenheit;	1;	default;	en;	fahrenheit;	1	# likely region = US
 gallon-imperial;	2.5;	fluid;	en-u-rg-uszzzz-ms-metric;	liter;	11.365225
 gallon-imperial;	2.5;	fluid;	en-u-rg-dezzzz;	liter;	11.365225
 gallon-imperial;	2.5;	fluid;	en-DE;	liter;	11.365225
-gallon-imperial;	2.5;	fluid;	en-US-u-rg-uszzzz-ms-uksystem;	gallon-imperial;	2.5 # ms-uksystem should behave like GB
+gallon-imperial;	2.5;	fluid;	en-US-u-rg-uszzzz-ms-uksystem;	gallon-imperial;	2.5	# ms-uksystem should behave like GB
 gallon-imperial;	2.5;	fluid;	en-u-rg-gbzzzz;	gallon-imperial;	2.5
 gallon-imperial;	2.5;	fluid;	en-GB;	gallon-imperial;	2.5
 gallon-imperial;	2.5;	fluid;	en-u-rg-uszzzz-ms-ussystem;	gallon;	1,420,653,125/473176473
 gallon-imperial;	2.5;	fluid;	en-u-rg-uszzzz;	gallon;	1,420,653,125/473176473
 gallon-imperial;	2.5;	fluid;	en-US;	gallon;	1,420,653,125/473176473
-gallon-imperial;	2.5;	fluid;	en;	gallon;	1,420,653,125/473176473 # likely region = US
+gallon-imperial;	2.5;	fluid;	en;	gallon;	1,420,653,125/473176473	# likely region = US
 ampere;	2.5;	default;	en;	ampere;	2.5	# an input unit whose quantity has no preference data should get base units
 pound-force-foot;	12,345;	default;	en;	kilowatt-hour;	0.004649325714486427205
 kilocandela;	1;	default;	en;	candela;	1,000	# an input unit whose quantity has no preference data should get base units

diff --git a/common/testData/units/unitPreferencesTest.txt b/common/testData/units/unitPreferencesTest.txt
@@ -2,7 +2,7 @@
 # Test data for unit preferences
 #  Copyright © 1991-2024 Unicode, Inc.
 #  For terms of use, see http://www.unicode.org/copyright.html
-#  SPDX-License-Identifier: Unicode-DFS-2016
+#  SPDX-License-Identifier: Unicode-3.0
 #  CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
 #
 # Format:
@@ -22,7 +22,7 @@
 #		 • no formatting with the skeleton
 #		 • no suppression of zero values (for secondary -and- units such as pound in stone-and-pound)
 #
-# Generation: Set GENERATE_TESTS in TestUnits.java to regenerate unitPreferencesTest.txt.
+# Generation: Use GenerateUnitTestData.java to regenerate unitPreferencesTest.txt.
 
 area;	default;	001;	1100000;	1100000.0;	square-meter;	11/10;	1.1;	square-kilometer
 area;	default;	001;	1000000;	1000000.0;	square-meter;	1;	1.0;	square-kilometer

diff --git a/common/testData/units/unitsTest.txt b/common/testData/units/unitsTest.txt
@@ -1,7 +1,7 @@
 # Test data for unit conversions
 #  Copyright © 1991-2024 Unicode, Inc.
 #  For terms of use, see http://www.unicode.org/copyright.html
-#  SPDX-License-Identifier: Unicode-DFS-2016
+#  SPDX-License-Identifier: Unicode-3.0
 #  CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
 #
 # Format:
@@ -12,7 +12,7 @@
 #   round to 4 decimal digits before comparing.
 # Note that certain conversions are approximate, such as degrees to radians
 #
-# Generation: Set GENERATE_TESTS in TestUnits.java to regenerate unitsTest.txt.
+# Generation: Use GenerateUnitTestData.java to regenerate unitsTest.txt.
 
 acceleration	;	meter-per-square-second	;	meter-per-square-second	;	1 * x	;	1,000.00
 acceleration	;	g-force	;	meter-per-square-second	;	9.80665 * x	;	9806.65

diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateUnitTestData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateUnitTestData.java
@@ -1,34 +1,35 @@
 package org.unicode.cldr.tool;
 
+import com.google.common.base.Splitter;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableMultimap;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Multimap;
 import com.google.common.collect.TreeMultimap;
-import com.ibm.icu.number.LocalizedNumberFormatter;
-import com.ibm.icu.number.NumberFormatter;
-import com.ibm.icu.number.NumberFormatter.UnitWidth;
-import com.ibm.icu.number.UnlocalizedNumberFormatter;
-import com.ibm.icu.util.Measure;
-import com.ibm.icu.util.MeasureUnit;
 import com.ibm.icu.util.Output;
+import com.ibm.icu.util.ULocale;
+import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.math.BigInteger;
 import java.math.MathContext;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Collection;
 import java.util.Comparator;
+import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
-import org.unicode.cldr.util.CLDRLocale;
+import java.util.regex.Pattern;
 import org.unicode.cldr.util.CLDRPaths;
 import org.unicode.cldr.util.CldrUtility;
 import org.unicode.cldr.util.Pair;
 import org.unicode.cldr.util.Rational;
+import org.unicode.cldr.util.Rational.FormatStyle;
 import org.unicode.cldr.util.StandardCodes.LstrType;
 import org.unicode.cldr.util.SupplementalDataInfo;
 import org.unicode.cldr.util.TempPrintWriter;
@@ -55,7 +56,7 @@ public static void main(String[] args) {
         GenerateUnitTestData item = new GenerateUnitTestData();
         item.TestParseUnit();
         item.TestUnitPreferences();
-        item.testIcu();
+        item.generateUnitLocalePreferences();
     }
 
     static {
@@ -136,110 +137,148 @@ public void TestParseUnit() {
 
     public void TestUnitPreferences() {
         UnitPreferences prefs = SDI.getUnitPreferences();
-        if (true) {
-            try (TempPrintWriter pw =
-                            TempPrintWriter.openUTF8Writer(
-                                    CLDRPaths.TEST_DATA + "units", "unitPreferencesTest.txt");
-                    TempPrintWriter pwLocale =
-                            TempPrintWriter.openUTF8Writer(
-                                    CLDRPaths.TEST_DATA + "units",
-                                    "unitLocalePreferencesTest.txt")) {
-
-                pw.println(getHeader("Region"));
-                pwLocale.println(getHeader("Locale"));
-                Rational ONE_TENTH = Rational.of(1, 10);
+        try (TempPrintWriter pw =
+                TempPrintWriter.openUTF8Writer(
+                        CLDRPaths.TEST_DATA + "units", "unitPreferencesTest.txt")) {
+            pw.println(getHeader("Region"));
+            Rational ONE_TENTH = Rational.of(1, 10);
 
-                // Note that for production usage, precomputed data like the
-                // prefs.getFastMap(converter) would be used instead of the raw data.
+            // Note that for production usage, precomputed data like the
+            // prefs.getFastMap(converter) would be used instead of the raw data.
 
-                for (Entry<String, Map<String, Multimap<Set<String>, UnitPreference>>> entry :
-                        prefs.getData().entrySet()) {
-                    String quantity = entry.getKey();
-                    String baseUnit = converter.getBaseUnitFromQuantity(quantity);
-                    for (Entry<String, Multimap<Set<String>, UnitPreference>> entry2 :
-                            entry.getValue().entrySet()) {
-                        String usage = entry2.getKey();
+            for (Entry<String, Map<String, Multimap<Set<String>, UnitPreference>>> entry :
+                    prefs.getData().entrySet()) {
+                String quantity = entry.getKey();
+                String baseUnit = converter.getBaseUnitFromQuantity(quantity);
+                for (Entry<String, Multimap<Set<String>, UnitPreference>> entry2 :
+                        entry.getValue().entrySet()) {
+                    String usage = entry2.getKey();
 
-                        // collect samples of base units
-                        for (Entry<Set<String>, Collection<UnitPreference>> entry3 :
-                                entry2.getValue().asMap().entrySet()) {
-                            boolean first = true;
-                            Set<Rational> samples = new TreeSet<>(Comparator.reverseOrder());
-                            for (UnitPreference pref : entry3.getValue()) {
-                                final String topUnit =
-                                        UnitPreferences.SPLIT_AND
-                                                .split(pref.unit)
-                                                .iterator()
-                                                .next();
-                                if (first) {
-                                    samples.add(
-                                            converter.convert(
-                                                    pref.geq.add(ONE_TENTH),
-                                                    topUnit,
-                                                    baseUnit,
-                                                    false));
-                                    first = false;
-                                }
-                                samples.add(converter.convert(pref.geq, topUnit, baseUnit, false));
+                    // collect samples of base units
+                    for (Entry<Set<String>, Collection<UnitPreference>> entry3 :
+                            entry2.getValue().asMap().entrySet()) {
+                        boolean first = true;
+                        Set<Rational> samples = new TreeSet<>(Comparator.reverseOrder());
+                        for (UnitPreference pref : entry3.getValue()) {
+                            final String topUnit =
+                                    UnitPreferences.SPLIT_AND.split(pref.unit).iterator().next();
+                            if (first) {
                                 samples.add(
                                         converter.convert(
-                                                pref.geq.subtract(ONE_TENTH),
-                                                topUnit,
-                                                baseUnit,
-                                                false));
+                                                pref.geq.add(ONE_TENTH), topUnit, baseUnit, false));
+                                first = false;
                             }
-                            // show samples
-                            Set<String> regions = entry3.getKey();
-                            String sampleRegion = regions.iterator().next();
-                            Collection<UnitPreference> uprefs = entry3.getValue();
-                            for (Rational sample : samples) {
-                                showSample(
-                                        quantity,
-                                        usage,
-                                        sampleRegion,
-                                        sample,
-                                        baseUnit,
-                                        uprefs,
-                                        pw);
-                                for (String sampleLocale : getSampleLocales(regions)) {
-                                    showSample(
-                                            quantity,
-                                            usage,
-                                            sampleLocale,
-                                            sample,
+                            samples.add(converter.convert(pref.geq, topUnit, baseUnit, false));
+                            samples.add(
+                                    converter.convert(
+                                            pref.geq.subtract(ONE_TENTH),
+                                            topUnit,
                                             baseUnit,
-                                            uprefs,
-                                            pwLocale);
-                                }
-                            }
-                            pw.println();
-                            pwLocale.println();
+                                            false));
+                        }
+                        // show samples
+                        Set<String> regions = entry3.getKey();
+                        String sampleRegion = regions.iterator().next();
+                        Collection<UnitPreference> uprefs = entry3.getValue();
+                        for (Rational sample : samples) {
+                            showSample(quantity, usage, sampleRegion, sample, baseUnit, uprefs, pw);
                         }
+                        pw.println();
                     }
                 }
             }
         }
     }
 
-    static LikelySubtags likely = new LikelySubtags();
+    /**
+     * Regenerates unitLocalePreferencesTest.txt from its own current contents: every existing
+     * line is read back and passed through {@link #formatPwLocale}, which writes the refreshed
+     * line to the new file.
+     *
+     * @throws UncheckedIOException if the existing test file cannot be opened
+     */
+    public void generateUnitLocalePreferences() {
+        try (TempPrintWriter pwLocale =
+                TempPrintWriter.openUTF8Writer(
+                        CLDRPaths.TEST_DATA + "units", "unitLocalePreferencesTest.txt")) {
 
-    private Set<String> getSampleLocales(Set<String> regions) {
-        Set<String> result = new TreeSet<>();
-        int count = 2;
-        for (String region : regions) {
-            if (--count < 0) {
-                break;
+            // Files.lines keeps the file open until the stream is closed, so the stream is
+            // managed by try-with-resources instead of being left for the garbage collector.
+            try (var existingLines =
+                    Files.lines(
+                            Path.of(CLDRPaths.TEST_DATA + "units/unitLocalePreferencesTest.txt"))) {
+                Set<List<Object>> seen = new HashSet<>();
+                // first copy existing lines
+                // This includes the header, so modify the old header if changes are needed!
+                existingLines.forEach(line -> formatPwLocale(pwLocale, line, seen));
+                // TODO: add more lines
+                // NOTE(review): the line returned here is not written to pwLocale; the call only
+                // validates the input and records it in seen. Confirm whether it should be printed.
+                formatLocaleLine(
+                        "byte-per-millisecond", Rational.of(123), "default", "en", "", seen);
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+        }
+    }
+
+    static final Splitter SPLIT_SEMI = Splitter.on(Pattern.compile("\\s*;\\s*")).trimResults();
+
+    /**
+     * Processes one raw line of the existing test file and writes the result to pwLocale.
+     *
+     * <p>Everything from the first '#' onward is treated as a comment. A line with no data is
+     * written only if it has a comment; otherwise it is dropped. A data line is regenerated from
+     * its first four fields (unit, amount, usage, language tag) by {@link #formatLocaleLine}; any
+     * further fields on the old line are ignored.
+     *
+     * @param pwLocale writer for the regenerated file
+     * @param rawLine one line of the existing file
+     * @param seen inputs already emitted, passed through to formatLocaleLine
+     * @throws IllegalArgumentException if a data line has fewer than four fields
+     */
+    private void formatPwLocale(TempPrintWriter pwLocale, String rawLine, Set<List<Object>> seen) {
+        int hashPos = rawLine.indexOf('#');
+        String line = hashPos < 0 ? rawLine : rawLine.substring(0, hashPos);
+        String comment = hashPos < 0 ? "" : "#" + rawLine.substring(hashPos + 1);
+        if (line.isBlank()) {
+            if (!comment.isBlank()) {
+                pwLocale.println(comment);
             }
-            String max = likely.maximize("und_" + region);
-            String lang = CLDRLocale.getInstance(max).getLanguage();
-            result.add(lang);
-            result.add("zu_" + region);
+            return;
+        }
+        List<String> parts = SPLIT_SEMI.splitToList(line);
+        // Fail with the offending line instead of an opaque IndexOutOfBoundsException below.
+        if (parts.size() < 4) {
+            throw new IllegalArgumentException(
+                    String.format(
+                            "Expected at least 4 semicolon-separated fields but found %d in line: %s",
+                            parts.size(), rawLine));
+        }
+
+        String sourceUnit = parts.get(0);
+        Rational sourceAmount = Rational.of(parts.get(1));
+        String usage = parts.get(2);
+        String languageTag = parts.get(3);
+        String newLine =
+                formatLocaleLine(sourceUnit, sourceAmount, usage, languageTag, comment, seen);
+        // null means this input was already emitted
+        if (newLine != null) {
+            pwLocale.println(newLine);
         }
-        return result;
     }
 
+    /**
+     * Builds one data line for unitLocalePreferencesTest.txt, in the form
+     * {@code input-unit; amount; usage; languageTag; expected-unit; expected-amount # comment}.
+     *
+     * @param sourceUnit input unit identifier
+     * @param sourceAmount input amount
+     * @param usage usage passed to the unit-preference lookup
+     * @param languageTag language tag of the locale to look up
+     * @param comment trailing comment; appended after a tab unless blank
+     * @param seen inputs already emitted; this call records its own input in it
+     * @return the formatted line, or null if the same (unit, amount, usage, languageTag) was
+     *     already recorded in seen
+     * @throws IllegalArgumentException if no unit preference is found for the input
+     */
+    public String formatLocaleLine(
+            String sourceUnit,
+            Rational sourceAmount,
+            String usage,
+            String languageTag,
+            String comment,
+            Set<List<Object>> seen) {
+        List<Object> bundle = List.of(sourceUnit, sourceAmount, usage, languageTag);
+        // Set.add returns false when an equal bundle is already present. The previous check,
+        // bundle.contains(seen), tested the wrong collection and never detected a duplicate.
+        // NOTE(review): relies on Rational implementing equals/hashCode by value; confirm.
+        if (!seen.add(bundle)) {
+            return null;
+        }
+
+        UnitPreferences prefs = SDI.getUnitPreferences();
+        final ULocale uLocale = ULocale.forLanguageTag(languageTag);
+        UnitPreference unitPreference =
+                prefs.getUnitPreference(sourceAmount, sourceUnit, usage, uLocale);
+        if (unitPreference == null) { // if the quantity isn't found
+            throw new IllegalArgumentException(
+                    String.format(
+                            "No unit preferences found for unit: %s, usage: %s, locale:%s",
+                            sourceUnit, usage, languageTag));
+        }
+        String actualUnit = unitPreference.unit;
+        Rational actualValue = converter.convert(sourceAmount, sourceUnit, actualUnit, false);
+        // #    input-unit; amount; usage;  languageTag; expected-unit; expected-amount # comment
+        return String.format(
+                "%s;\t%s;\t%s;\t%s;\t%s;\t%s%s",
+                sourceUnit,
+                sourceAmount.toString(FormatStyle.formatted),
+                usage,
+                languageTag,
+                actualUnit,
+                actualValue.toString(FormatStyle.formatted),
+                comment.isBlank() ? "" : "\t" + comment);
+    }
+
+    static LikelySubtags likely = new LikelySubtags();
+
     public String getHeader(String regionOrLocale) {
-        return "\n# Test data for unit preferences\n"
+        return "\n# Test data for unit region preferences\n"
                 + CldrUtility.getCopyrightString("#  ")
                 + "\n"
                 + "#\n"
@@ -398,19 +437,4 @@ private void checkUnitConvertability(
             }
         }
     }
-
-    private void testIcu() {
-        UnlocalizedNumberFormatter nf =
-                NumberFormatter.with().unitWidth(UnitWidth.FULL_NAME).usage("road");
-
-        Object tests[][] = {{1d, MeasureUnit.MILE, "en", "result"}};
-        for (Object test[] : tests) {
-            Double value = (Double) test[0];
-            MeasureUnit unit = (MeasureUnit) test[1];
-            final LocalizedNumberFormatter localized =
-                    nf.locale(Locale.forLanguageTag((String) test[2]));
-            String actual = (String) test[3];
-            actual = localized.format(new Measure(value, unit)).toString();
-        }
-    }
 }
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/GrammarInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/GrammarInfo.java
@@ -761,6 +761,10 @@ public static Set<String> getSpecialsToTranslate() {
         return INCLUDE_OTHER;
     }
 
+    public static final Set<String> SUPPRESS_ADDING_GRAMMAR =
+            ImmutableSet.of(
+                    "dot-per-centimeter", "millimeter-ofhg", "milligram-ofglucose-per-deciliter");
+
     public static final boolean DEBUG = false;
     /** Internal class for thread-safety */
     static class UnitsToAddGrammar {
@@ -786,7 +790,8 @@ static class UnitsToAddGrammar {
                 if (!EXCLUDE_GRAMMAR.contains(shortUnit)) {
                     Set<UnitSystem> systems = converter.getSystemsEnum(shortUnit);
                     // we now add all SI and metric and si_acceptable and metric_adjacent
-                    if (!Collections.disjoint(systems, UnitSystem.SiOrMetric)) {
+                    if (!Collections.disjoint(systems, UnitSystem.SiOrMetric)
+                            && !SUPPRESS_ADDING_GRAMMAR.contains(unit)) {
                         _data.add(unit);
                         continue;
                     }