From 31aa6d928eb0ea0bdbed9969c3db68cff6cfc28f Mon Sep 17 00:00:00 2001 From: Peter Edberg <42151464+pedberg-icu@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:34:10 -0700 Subject: [PATCH 01/10] CLDR-17126 Update CLDR segment tailorings to reflect ICU 74 word break changes (#3289) --- common/segments/en_US_POSIX.xml | 1 - common/segments/root.xml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/common/segments/en_US_POSIX.xml b/common/segments/en_US_POSIX.xml index 16edb9be64d..fa69712f07f 100644 --- a/common/segments/en_US_POSIX.xml +++ b/common/segments/en_US_POSIX.xml @@ -16,7 +16,6 @@ For terms of use, see http://www.unicode.org/copyright.html [[$MidNumLet]-[.]] - [[$MidLetter]-[\:]] [[$MidNum] [.]] diff --git a/common/segments/root.xml b/common/segments/root.xml index ca386fd45e9..f4609b8f247 100644 --- a/common/segments/root.xml +++ b/common/segments/root.xml @@ -375,7 +375,7 @@ For terms of use, see http://www.unicode.org/copyright.html [\p{Word_Break=Format}] \p{Word_Break=Katakana} \p{Word_Break=ALetter} - \p{Word_Break=MidLetter} + [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]] \p{Word_Break=MidNum} \p{Word_Break=MidNumLet} \p{Word_Break=Numeric} From d52192c9e92db518f706a45faf96d970622382b7 Mon Sep 17 00:00:00 2001 From: Peter Edberg <42151464+pedberg-icu@users.noreply.github.com> Date: Fri, 29 Sep 2023 11:31:13 -0700 Subject: [PATCH 02/10] CLDR-17129 Fix inconsistency in fr dayPeriods for night1; always use [du] matin, not [de la] nuit (#3291) * Also allow display collisions between names for night1 and morning1/am if night starts at 00:00 --- common/main/fr.xml | 2 +- .../java/org/unicode/cldr/util/DayPeriodInfo.java | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/common/main/fr.xml b/common/main/fr.xml index afd5b86f490..51b9019c681 100644 --- a/common/main/fr.xml +++ b/common/main/fr.xml @@ -3026,7 +3026,7 @@ Warnings: All cp values have U+FE0F characters removed. 
See /annotationsDerived/ matin après-midi soir - nuit + matin ↑↑↑ diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DayPeriodInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DayPeriodInfo.java index 1c68199ae2f..e7344fb9611 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DayPeriodInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DayPeriodInfo.java @@ -431,6 +431,20 @@ public boolean collisionIsError( return false; } + // Hack for French night1, CLDR-17132 for better fix + if ((dayPeriod1 == DayPeriod.night1 + && (dayPeriod2 == DayPeriod.morning1 || dayPeriod2 == DayPeriod.am)) + || (dayPeriod2 == DayPeriod.night1 + && (dayPeriod1 == DayPeriod.morning1 || dayPeriod1 == DayPeriod.am))) { + if (dayPeriodsToSpans.get(DayPeriod.night1).size() == 1) { + for (Span s : dayPeriodsToSpans.get(DayPeriod.night1)) { + if (s.start == MIDNIGHT) { + return false; + } + } + } + } + // we use the more lenient if they are mixed types if (type2 == Type.format) { type1 = Type.format; From b2ca78e6bd7ff62c531830c7c20f3c0cb06b8616 Mon Sep 17 00:00:00 2001 From: Peter Edberg <42151464+pedberg-icu@users.noreply.github.com> Date: Fri, 29 Sep 2023 13:31:24 -0700 Subject: [PATCH 03/10] CLDR-17097 Update ICU4J libs #4, new ICU4J mavenization; update README data & status (#3290) --- README.md | 9 ++++----- tools/cldr-apps/pom.xml | 2 +- .../org/unicode/cldr/unittest/web/perf/PerfTest.java | 2 +- tools/cldr-code/pom.xml | 2 +- .../src/main/java/org/unicode/cldr/icu/FixEras.java | 2 +- .../src/main/java/org/unicode/cldr/icu/RBNFWriter.java | 2 +- .../java/org/unicode/cldr/posix/GenerateCharmap.java | 2 +- .../java/org/unicode/cldr/posix/GeneratePOSIX.java | 2 +- .../java/org/unicode/cldr/test/ConsoleCheckCLDR.java | 2 +- .../main/java/org/unicode/cldr/tool/CLDRModify.java | 2 +- .../main/java/org/unicode/cldr/tool/CompareData.java | 2 +- .../org/unicode/cldr/tool/GenerateSidewaysView.java | 2 +- 
.../main/java/org/unicode/cldr/tool/KeyboardTool.java | 2 +- .../src/main/java/org/unicode/cldr/tool/Misc.java | 2 +- .../main/java/org/unicode/cldr/tool/ModifyCase.java | 2 +- .../src/main/java/org/unicode/cldr/tool/ShowData.java | 2 +- tools/cldr-rdf/pom.xml | 2 +- tools/pom.xml | 10 ++++++---- 18 files changed, 26 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 6f98b05d0e7..c0b963b477a 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,14 @@ For current CLDR release information, see [cldr.unicode.org](https://cldr.unicod ## Status -Update: 2023-08-31 +Update: 2023-09-28 -> **Note:** This is a preliminary version of CLDR 44, intended for those wishing to do pre-release testing. It is not recommended for production use.

- - + +**Note:** This is a pre-release candidate version of CLDR 44, intended for testing. It is not recommended for production use. + ### What is CLDR? The Unicode CLDR provides key building blocks for software to support the world's languages, with the largest and most extensive standard repository of locale data available. This data is used by a wide spectrum of companies for their software internationalization and localization, adapting software to the conventions of different languages for such common software tasks. diff --git a/tools/cldr-apps/pom.xml b/tools/cldr-apps/pom.xml index 1841e85bbc0..8f4302663f4 100644 --- a/tools/cldr-apps/pom.xml +++ b/tools/cldr-apps/pom.xml @@ -62,7 +62,7 @@ com.ibm.icu - icu4j-for-cldr + icu4j diff --git a/tools/cldr-apps/src/test/java/org/unicode/cldr/unittest/web/perf/PerfTest.java b/tools/cldr-apps/src/test/java/org/unicode/cldr/unittest/web/perf/PerfTest.java index e07f5c0a835..1d834f6e321 100644 --- a/tools/cldr-apps/src/test/java/org/unicode/cldr/unittest/web/perf/PerfTest.java +++ b/tools/cldr-apps/src/test/java/org/unicode/cldr/unittest/web/perf/PerfTest.java @@ -6,7 +6,7 @@ */ package org.unicode.cldr.unittest.web.perf; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.impl.LocaleUtility; import java.io.BufferedReader; import java.io.FileInputStream; diff --git a/tools/cldr-code/pom.xml b/tools/cldr-code/pom.xml index bce3f78f492..5d7abceecfb 100644 --- a/tools/cldr-code/pom.xml +++ b/tools/cldr-code/pom.xml @@ -28,7 +28,7 @@ com.ibm.icu - icu4j-for-cldr + icu4j diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/icu/FixEras.java b/tools/cldr-code/src/main/java/org/unicode/cldr/icu/FixEras.java index 963bb69dddd..76da2d76616 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/icu/FixEras.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/icu/FixEras.java @@ -8,7 +8,7 @@ */ package org.unicode.cldr.icu; -import com.ibm.icu.dev.tool.UOption; 
+import com.ibm.icu.dev.tool.shared.UOption; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/icu/RBNFWriter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/icu/RBNFWriter.java index 712d2b693b8..0ad95c7ee82 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/icu/RBNFWriter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/icu/RBNFWriter.java @@ -8,7 +8,7 @@ */ package org.unicode.cldr.icu; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.SimpleDateFormat; import com.ibm.icu.util.Calendar; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/posix/GenerateCharmap.java b/tools/cldr-code/src/main/java/org/unicode/cldr/posix/GenerateCharmap.java index 8672f9c8a43..b45069f58c0 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/posix/GenerateCharmap.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/posix/GenerateCharmap.java @@ -8,7 +8,7 @@ */ package org.unicode.cldr.posix; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/posix/GeneratePOSIX.java b/tools/cldr-code/src/main/java/org/unicode/cldr/posix/GeneratePOSIX.java index 0ba2ea23919..94d65d11b5d 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/posix/GeneratePOSIX.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/posix/GeneratePOSIX.java @@ -8,7 +8,7 @@ */ package org.unicode.cldr.posix; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.text.UnicodeSet; import java.io.File; import java.io.PrintWriter; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/ConsoleCheckCLDR.java 
b/tools/cldr-code/src/main/java/org/unicode/cldr/test/ConsoleCheckCLDR.java index f3965cda842..0d8b72db6a8 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/ConsoleCheckCLDR.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/ConsoleCheckCLDR.java @@ -1,6 +1,6 @@ package org.unicode.cldr.test; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.dev.util.ElapsedTimer; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java index e894b537143..9514b2fcbb1 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java @@ -8,7 +8,7 @@ import com.google.common.base.Joiner; import com.google.common.base.Splitter; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.Collator; import com.ibm.icu.text.DateTimePatternGenerator; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareData.java index da11f4657d1..845bdd4c865 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareData.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareData.java @@ -1,6 +1,6 @@ package org.unicode.cldr.tool; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateSidewaysView.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateSidewaysView.java index 82917dc6a25..09fe9425e43 100644 --- 
a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateSidewaysView.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateSidewaysView.java @@ -10,7 +10,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Utility; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/KeyboardTool.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/KeyboardTool.java index 8f702bbfaed..4e0da24b08e 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/KeyboardTool.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/KeyboardTool.java @@ -1,6 +1,6 @@ package org.unicode.cldr.tool; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import org.unicode.cldr.util.CLDRTool; @CLDRTool(alias = "kbd", description = "Tool for working with CLDR Keyboard files") diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/Misc.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/Misc.java index cf9a33636b9..b958aab76cb 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/Misc.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/Misc.java @@ -8,7 +8,7 @@ */ package org.unicode.cldr.tool; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.Collator; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ModifyCase.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ModifyCase.java index 8df879ecbc2..d5d28a470da 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ModifyCase.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ModifyCase.java @@ -9,7 +9,7 @@ package org.unicode.cldr.tool; 
-import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.lang.UCharacter; import java.io.BufferedWriter; import java.io.FileWriter; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowData.java index c66dd1270e1..3a1a799171a 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowData.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowData.java @@ -7,7 +7,7 @@ package org.unicode.cldr.tool; import com.google.common.base.Joiner; -import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.dev.tool.shared.UOption; import com.ibm.icu.impl.Relation; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.Collator; diff --git a/tools/cldr-rdf/pom.xml b/tools/cldr-rdf/pom.xml index e1825a7c2ba..8209bd21ed9 100644 --- a/tools/cldr-rdf/pom.xml +++ b/tools/cldr-rdf/pom.xml @@ -54,7 +54,7 @@ com.ibm.icu - icu4j-for-cldr + icu4j diff --git a/tools/pom.xml b/tools/pom.xml index f3f50e945c4..f8237a92deb 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,9 +20,11 @@ UTF-8 11 11 - - 74.0.1-SNAPSHOT-cldr-2023-08-22 + + 74.1-SNAPSHOT 5.8.2 3.1.0 3.11.1 @@ -58,7 +60,7 @@ com.ibm.icu - icu4j-for-cldr + icu4j ${icu4j.version} From fadf398713d321f8bb4dd99b27f5fb17fae9c8a6 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 2 Oct 2023 09:33:10 +0200 Subject: [PATCH 04/10] CLDR-17026 Fix Cyrillic romanization (#3296) * CLDR-17026 Fix Cyrillic romanization * CLDR-17026 adjust filter * CLDR-17026 Fix for spotless --- common/transforms/Cyrillic-Latin.xml | 21 +++++++------------ .../unicode/cldr/unittest/TestTransforms.java | 13 +++++------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/common/transforms/Cyrillic-Latin.xml b/common/transforms/Cyrillic-Latin.xml index 467955b2e85..9324e884f48 100644 --- a/common/transforms/Cyrillic-Latin.xml +++ b/common/transforms/Cyrillic-Latin.xml @@ -17,6 +17,7 @@ For terms of 
use, see http://www.unicode.org/copyright.html # :: [\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:nonspacing mark:]] ; ### WARNING, ̈ must be added to the generated filters, in both directions ### # MINIMAL FILTER +# Cyrillic-Latin :: [ҺһңҢҰұҮүөӨҚқ̈Ă-ăĔ-ĕĞ-ğĬ-ĭŎ-ŏŬ-ŭ̆Ѐ-џҐ-ҕҘ-ҙӁ-ӂӐ-ӟӢ-ӧӬ-ӵӸ-ӹḜ-ḝẮ-ặᾰᾸῐῘῠῨ] ; :: NFD (NFC) ; $modprime = ʹ; @@ -75,15 +76,10 @@ $beforeLower = $ignoreForCase * $lower ; Ћ ↔ C $acute ; # CYRILLIC CAPITAL LETTER TSHE џ ↔ d $hat ; # CYRILLIC SMALL LETTER DZHE Џ ↔ D $hat ; # CYRILLIC CAPITAL LETTER DZHE - -х ↔ kh ; # CYRILLIC SMALL LETTER HA -Х } $beforeLower ↔ Kh ; -Х ↔ KH; # CYRILLIC CAPITAL LETTER HA - -# Insert separator between K and characters that result in h -# And delete going the other way -[Kk] { } [Һһ] → ‧ ; -← ‧ ; +# https://www.eki.ee/wgrs/v2_2/rom2_az.htm +# but modified to not collide with Cyrillic HA +һ ↔ h $breveBelow ; # CYRILLIC SMALL LETTER SHHA +Һ ↔ H $breveBelow; # CYRILLIC CAPITAL LETTER SHHA # Normal order а ↔ a ; # CYRILLIC SMALL LETTER A @@ -179,9 +175,8 @@ $beforeLower = $ignoreForCase * $lower ; # Ѹ ↔ XXX ; # CYRILLIC CAPITAL LETTER UK ф ↔ f ; # CYRILLIC SMALL LETTER EF Ф ↔ F ; # CYRILLIC CAPITAL LETTER EF -#https://www.eki.ee/wgrs/v2_2/rom2_az.htm -һ ↔ h ; # CYRILLIC SMALL LETTER SHHA -Һ ↔ H ; # CYRILLIC CAPITAL LETTER SHHA +х ↔ h ; # CYRILLIC SMALL LETTER HA +Х ↔ H; # CYRILLIC CAPITAL LETTER HA # ҳ ↔ XXX ; # CYRILLIC SMALL LETTER HA WITH DESCENDER # Ҳ ↔ XXX ; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER # ѡ ↔ XXX ; # CYRILLIC SMALL LETTER OMEGA @@ -295,7 +290,7 @@ $ignore = [[:Mark:]''] * ; # note: a global filter is more efficient, but MUST include all source chars!! 
# :: ([\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:nonspacing mark:] ‧]); # MINIMAL FILTER: Latin-Cyrillic -:: ( [hH‧ˌ̈A-Za-zÀ-ÏÑ-ÖÙ-Ýà-ïñ-öù-ýÿ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƏƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳəʹ-ʺ̀-̂̆-̦̱̇̌̀-́̈́ʹ΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЀЃЌ-ЎЙйѐѓќ-ўӁ-ӂӐ-ӑӖ-ӗḀ-ẙẛẠ-ỹἂ-ἅἊ-Ἅἒ-ἕἚ-Ἕἢ-ἥἪ-Ἥἲ-ἵἺ-Ἵὂ-ὅὊ-Ὅὒ-ὕὛὝὢ-ὥὪ-Ὥὰ-ώᾂ-ᾅᾊ-ᾍᾒ-ᾕᾚ-ᾝᾢ-ᾥᾪ-ᾭᾰᾲᾴᾸᾺ-ΆῂῄῈ-Ή῍-῎ῐῒ-ΐῘῚ-Ί῝-῞ῠῢ-ΰῨῪ-Ύ῭-΅ῲῴῸ-ΏK-Å] ) ; +:: ( [ḫḪhH‧ˌ̈A-Za-zÀ-ÏÑ-ÖÙ-Ýà-ïñ-öù-ýÿ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƏƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳəʹ-ʺ̀-̂̆-̦̱̇̌̀-́̈́ʹ΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЀЃЌ-ЎЙйѐѓќ-ўӁ-ӂӐ-ӑӖ-ӗḀ-ẙẛẠ-ỹἂ-ἅἊ-Ἅἒ-ἕἚ-Ἕἢ-ἥἪ-Ἥἲ-ἵἺ-Ἵὂ-ὅὊ-Ὅὒ-ὕὛὝὢ-ὥὪ-Ὥὰ-ώᾂ-ᾅᾊ-ᾍᾒ-ᾕᾚ-ᾝᾢ-ᾥᾪ-ᾭᾰᾲᾴᾸᾺ-ΆῂῄῈ-Ή῍-῎ῐῒ-ΐῘῚ-Ί῝-῞ῠῢ-ΰῨῪ-Ύ῭-΅ῲῴῸ-ΏK-Å] ) ; ]]> diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestTransforms.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestTransforms.java index 3a1b3475994..d183c6bb6b8 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestTransforms.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestTransforms.java @@ -116,16 +116,13 @@ public void TestCyrillicLatin() { Transliterator latin_cyrillic = cyrillic_latin.getInverse(); checkSimpleRoundTrip(cyrillic_latin, latin_cyrillic, new UnicodeSet("[ӧӦ ӱӰӯӮ\\p{M}]")); String[][] tests = { - {"х", "kh"}, - {"Ха", "Kha"}, - {"Х", "KH"}, - {"кһ", "k‧h"}, - {"Кһа", "K‧ha"}, - {"КҺ", "K‧H"}, + {"х", "h"}, + {"Ха", "Ha"}, + {"Х", "H"}, {"к", "k"}, {"К", "K"}, - {"һ", "h"}, - {"Һ", "H"}, + {"һ", "ḫ"}, + {"Һ", "Ḫ"}, }; int count = 0; for (String[] test : tests) { From 57571f32264b1e53e7e24e50042b2037101c451c Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 2 Oct 2023 19:43:50 +0200 Subject: [PATCH 05/10] CLDR-17099 Fix ICU breakage with likelysubtags (#3298) --- common/supplemental/likelySubtags.xml | 3 +++ .../localeIdentifiers/likelySubtags.txt | 20 +++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/common/supplemental/likelySubtags.xml b/common/supplemental/likelySubtags.xml index 
2a4497b2dec..1a13b214fe2 100644 --- a/common/supplemental/likelySubtags.xml +++ b/common/supplemental/likelySubtags.xml @@ -1653,7 +1653,9 @@ not be patched by hand, as any changes made in that fashion may be lost. + + @@ -1733,6 +1735,7 @@ not be patched by hand, as any changes made in that fashion may be lost. + diff --git a/common/testData/localeIdentifiers/likelySubtags.txt b/common/testData/localeIdentifiers/likelySubtags.txt index 23bceab6ce6..1f9e3557fff 100644 --- a/common/testData/localeIdentifiers/likelySubtags.txt +++ b/common/testData/localeIdentifiers/likelySubtags.txt @@ -1259,7 +1259,7 @@ und-Latn-001 ; en-Latn-001 ; en-001 ; und-Latn-150 ; en-Latn-150 ; en-150 ; und-Latn-419 ; es-Latn-419 ; es-419 ; und-Latn-AD ; ca-Latn-AD ; ca-AD ; -und-Latn-AE ; ar-Latn-AE ; ; +und-Latn-AE ; en-Latn-AE ; en-AE ; und-Latn-AG ; en-Latn-AG ; en-AG ; und-Latn-AI ; en-Latn-AI ; en-AI ; und-Latn-AL ; sq-Latn-AL ; sq ; @@ -1288,7 +1288,7 @@ und-Latn-BS ; en-Latn-BS ; en-BS ; und-Latn-BW ; en-Latn-BW ; en-BW ; und-Latn-BZ ; en-Latn-BZ ; en-BZ ; und-Latn-CA ; en-Latn-CA ; en-CA ; -und-Latn-CC ; ms-Latn-CC ; ; +und-Latn-CC ; en-Latn-CC ; en-CC ; und-Latn-CD ; sw-Latn-CD ; sw-CD ; und-Latn-CF ; fr-Latn-CF ; fr-CF ; und-Latn-CG ; fr-Latn-CG ; fr-CG ; @@ -1316,7 +1316,7 @@ und-Latn-DZ ; fr-Latn-DZ ; fr-DZ ; und-Latn-EA ; es-Latn-EA ; es-EA ; und-Latn-EC ; es-Latn-EC ; es-EC ; und-Latn-EE ; et-Latn-EE ; et ; -und-Latn-ER ; ti-Latn-ER ; ; +und-Latn-ER ; en-Latn-ER ; en-ER ; und-Latn-ES ; es-Latn-ES ; es ; und-Latn-ET ; en-Latn-ET ; en-ET ; und-Latn-FI ; fi-Latn-FI ; fi ; @@ -1341,7 +1341,7 @@ und-Latn-GT ; es-Latn-GT ; es-GT ; und-Latn-GU ; en-Latn-GU ; en-GU ; und-Latn-GW ; pt-Latn-GW ; pt-GW ; und-Latn-GY ; en-Latn-GY ; en-GY ; -und-Latn-HK ; zh-Latn-HK ; ; +und-Latn-HK ; en-Latn-HK ; en-HK ; und-Latn-HN ; es-Latn-HN ; es-HN ; und-Latn-HR ; hr-Latn-HR ; hr ; und-Latn-HT ; ht-Latn-HT ; ht ; @@ -1349,9 +1349,9 @@ und-Latn-HU ; hu-Latn-HU ; hu ; und-Latn-IC ; es-Latn-IC ; es-IC 
; und-Latn-ID ; id-Latn-ID ; id ; und-Latn-IE ; en-Latn-IE ; en-IE ; -und-Latn-IL ; he-Latn-IL ; he-Latn ; +und-Latn-IL ; en-Latn-IL ; en-IL ; und-Latn-IM ; en-Latn-IM ; en-IM ; -und-Latn-IN ; hi-Latn-IN ; hi-Latn ; +und-Latn-IN ; en-Latn-IN ; en-IN ; und-Latn-IO ; en-Latn-IO ; en-IO ; und-Latn-IS ; is-Latn-IS ; is ; und-Latn-IT ; it-Latn-IT ; it ; @@ -1385,7 +1385,7 @@ und-Latn-MR ; fr-Latn-MR ; fr-MR ; und-Latn-MS ; en-Latn-MS ; en-MS ; und-Latn-MT ; mt-Latn-MT ; mt ; und-Latn-MU ; mfe-Latn-MU ; mfe ; -und-Latn-MV ; dv-Latn-MV ; dv-Latn ; +und-Latn-MV ; en-Latn-MV ; en-MV ; und-Latn-MW ; en-Latn-MW ; en-MW ; und-Latn-MX ; es-Latn-MX ; es-MX ; und-Latn-MY ; ms-Latn-MY ; ms ; @@ -1406,7 +1406,7 @@ und-Latn-PE ; es-Latn-PE ; es-PE ; und-Latn-PF ; fr-Latn-PF ; fr-PF ; und-Latn-PG ; tpi-Latn-PG ; tpi ; und-Latn-PH ; fil-Latn-PH ; fil ; -und-Latn-PK ; ur-Latn-PK ; ur-Latn ; +und-Latn-PK ; en-Latn-PK ; en-PK ; und-Latn-PL ; pl-Latn-PL ; pl ; und-Latn-PM ; fr-Latn-PM ; fr-PM ; und-Latn-PN ; en-Latn-PN ; en-PN ; @@ -1420,7 +1420,7 @@ und-Latn-RS ; sr-Latn-RS ; sr-Latn ; und-Latn-RW ; rw-Latn-RW ; rw ; und-Latn-SB ; en-Latn-SB ; en-SB ; und-Latn-SC ; fr-Latn-SC ; fr-SC ; -und-Latn-SD ; ar-Latn-SD ; ; +und-Latn-SD ; en-Latn-SD ; en-SD ; und-Latn-SE ; sv-Latn-SE ; sv ; und-Latn-SG ; en-Latn-SG ; en-SG ; und-Latn-SH ; en-Latn-SH ; en-SH ; @@ -1432,7 +1432,7 @@ und-Latn-SM ; it-Latn-SM ; it-SM ; und-Latn-SN ; fr-Latn-SN ; fr-SN ; und-Latn-SO ; so-Latn-SO ; so ; und-Latn-SR ; nl-Latn-SR ; nl-SR ; -und-Latn-SS ; ar-Latn-SS ; ; +und-Latn-SS ; en-Latn-SS ; en-SS ; und-Latn-ST ; pt-Latn-ST ; pt-ST ; und-Latn-SV ; es-Latn-SV ; es-SV ; und-Latn-SX ; en-Latn-SX ; en-SX ; From b02618881e9db17e6d743e7b7d8d6afe482a62c2 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 2 Oct 2023 19:45:30 +0200 Subject: [PATCH 06/10] CLDR-16711 Uniqueness of units (#3297) * CLDR-16711 Uniqueness of units * CLDR-16711 tweaks --- docs/ldml/tr35-general.md | 31 +++++++++++++++++++++++++++---- 1 file 
changed, 27 insertions(+), 4 deletions(-) diff --git a/docs/ldml/tr35-general.md b/docs/ldml/tr35-general.md index 62970f2aa82..31ee64a8702 100644 --- a/docs/ldml/tr35-general.md +++ b/docs/ldml/tr35-general.md @@ -868,7 +868,7 @@ The long unit identifers are used as a key in the translated unit names for loca | day | duration-day | -The list of valid CLDR simple unit identifiers is found in _Section Validity Data](tr35.md#Validity_Data)_. +The list of valid CLDR simple unit identifiers is found in _[Section Validity Data](tr35.md#Validity_Data)_. These names should not be presented to end users, however: the translated names for different languages (or variants of English) are available in the CLDR localized data. All syntactically valid CLDR unit identifiers values that are not listed in the validity data are reserved by CLDR for additional future units. There is one exception: implementations that need to define their own unit identifiers can do so via _[Private-Use Units](#Private_Use_Units)_. @@ -994,7 +994,10 @@ Some of the constraints reference data from the unitIdComponents in [Unit_Conver or <unitIdComponent type=”and”>
or <unitIdComponent type=”per”>. -
  • Constraint: must not have a prefix as an initial segment.
+
  • Constraint: must not have a prefix as an initial segment.
  • +
  • Constraint: no two different base_components will share the first 8 letters. + (For more information, see Unit Identifier Uniqueness.) +
suffix_component:= @@ -1036,11 +1039,31 @@ For example: * Similarly, when a base_component is encountered, one can collect any suffix components, and stop. * Encountering a suffix_component in any other circumstance is an error. +### Unit Identifier Uniqueness +CLDR Unit Identifiers can be used as values in locale identifiers. When that is done, the syntax is modified whenever a `prefixed_unit` would be longer than 8 characters. In such a case: + +* If there is no `prefix` the `prefixed_unit` is truncated to 8 characters. +* If there is a `prefix`, a hyphen is added between the `prefix` and the `base_component`. If that `base_component` is longer than 8 characters, it is truncated to 8 characters. + +_Example_ +| Unit identifier | BCP47 syntax example | Comment | +| ---- | ---- | ---- | +| kilogram | en-u-ux-kilogram | kilogram fits in 8 characters | +| centilux | en-u-ux-centilux | centilux fits in 8 characters | +| steradian | en-u-ux-steradia | steradian exceeds 8 characters | +| centigram | en-u-ux-centi-gram | centigram exceeds 8 characters | +| kilometer | en-u-ux-kilo-meter | kilometer exceeds 8 characters | +| quectolux | en-u-ux-quecto-lux | quectolux exceeds 8 characters | + +This requires that each of the elements in base_components is unique to eight letters, that is: **no two different base_components will share the first 8 letters**. + +The reason that the `prefixed_unit` as a whole is not simply truncated to 8 characters is that doing so would impose too strict a constraint. There are 5 letter prefixes such as 'centi' and more recently 6 letter prefixes such as 'quecto'. That would cause prefixed `base_component` as short as 'gram' and 'gray' to be ambiguous when truncated to 8 letters: 'centigra'; and 'lumen' and 'lux' would fail with the 6 letter prefixes. + ### Example Units The following table contains examples of groupings and units currently defined by CLDR. The units in CLDR are not comprehensive; it is anticipated that more will be added over time. 
-The complete list of supported units is in the validity data: see _Section Validity Data](tr35.md#Validity_Data)_. +The complete list of supported units is in the validity data: see _[Section Validity Data](tr35.md#Validity_Data)_. | Type | Core Unit Identifier | Compound? | Sample Format | | -------------- | ------------------------ | --------- | -------------- | @@ -1164,7 +1187,7 @@ There are three widths: **long**, **short**, and **narrow**. As usual, the narro Where the unit of measurement is one of the [International System of Units (SI)](https://physics.nist.gov/cuu/Units/units.html), the short and narrow forms will typically use the international symbols, such as “mm” for millimeter. They may, however, be different if that is customary for the language or locale. For example, in Russian it may be more typical to see the Cyrillic characters “мм”. -Units are included for translation even where they are not typically used in a particular locale, such as kilometers in the US, or inches in Germany. This is to account for use by travelers and specialized domains, such as the German “Fernseher von 32 bis 55 Zoll (80 bis 140 cm)” for TV screen size in inches and centimeters. +Units are sometimes included for translation even where they are not typically used in a particular locale, such as kilometers in the US, or inches in Germany. This is to account for use by travelers and specialized domains, such as the German “Fernseher von 32 bis 55 Zoll (80 bis 140 cm)” for TV screen size in inches and centimeters. For temperature, there is a special unit ``, which is used when it is clear from context whether Celcius or Fahrenheit is implied. From 7d62b11107e90280c4bda4530bbcaa1521cb9f8e Mon Sep 17 00:00:00 2001 From: "Steven R. 
Loomis" Date: Mon, 2 Oct 2023 18:10:27 -0500 Subject: [PATCH 07/10] CLDR-16918 kbd: respond to macchiati feedback (#3277) * CLDR-16918 kbd: Feedback pt 1: use und-Deva - and encourage explicit lists of locale IDs * CLDR-16918 kbd: MD Feedback pt 3: wording - MUST not include k0 in additional language * CLDR-16918 kbd: MD Feedback pt 4: remove brackets on XML - changed the XML fragments to valid XML, without [] or ()? denoting optionality - the body text discusses which attributes are required - also deleted some obsolete and incorrect attribute examples * CLDR-16918 kbd: MD Feedback pt 5: clarify unlocalized - layout and indicator are not localized * CLDR-16918 kbd: MD Feedback pt 6: clarify unlocalized names * CLDR-16918 kbd: MD Feedback pt 7,8: copy edit * CLDR-16918 kbd: MD Feedback pt 9: update 'accessibility' - no longer discourages certain features - retains some general guidance * CLDR-16918 kbd: MD Feedback pt 10: reference to example values - Also mention use of DTD Annotations - Also update the DTD compatibility notice per CLDR-17078 * CLDR-16918 kbd: MD Feedback pt 11: quoting 'no' * CLDR-16918 kbd: MD Feedback pt 12: update import section * CLDR-16918 kbd: MD Feedback pt 15: imports / CLDR version - note that implementations don't have to carry ALL CLDR versions - remaining questions are repository policy * CLDR-16918 kbd: macchiato Feedback pt 17: touch-only - layout authors aren't forced to include a hardware layout - however, give some reason for the recommendation * CLDR-16918 kbd: macchiato Feedback pt 18: about none - copyedit * CLDR-16918 kbd: macchiato Feedback pt 19: about opt and option - copyedit * CLDR-16918 kbd: macchiato Feedback pt 21: modifier matching * CLDR-16918 kbd: macchiato Feedback pt 21bis: modifier matching * CLDR-16918 kbd: Update per code review * CLDR-16918 Update docs/ldml/tr35-keyboards.md * CLDR-16918 Update docs/ldml/tr35-keyboards.md * CLDR-16918 kbd: spec: update per review comments Co-authored-by: Marc Durdin * 
CLDR-16918 Update docs/ldml/tr35-keyboards.md Co-authored-by: Marc Durdin --------- Co-authored-by: Marc Durdin --- docs/ldml/tr35-keyboards.md | 106 ++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 46 deletions(-) diff --git a/docs/ldml/tr35-keyboards.md b/docs/ldml/tr35-keyboards.md index df90c80ecfa..44434e155fa 100644 --- a/docs/ldml/tr35-keyboards.md +++ b/docs/ldml/tr35-keyboards.md @@ -147,6 +147,8 @@ The data can also be used in analysis of the capabilities of different keyboards For complete examples, see the XML files in the CLDR source repository. +Attribute values should be evaluated considering the DTD and [DTD Annotations](#DTD_Annotations). + * * * ## Goals and Non-goals @@ -184,17 +186,16 @@ Note that in parts of this document, the format `@x` is used to indicate the _at ### Compatibility Notice -> 👉 Note: CLDR-TC has agreed that the changes required were too extensive to maintain compatibility. For this reason, the DTD used here is _not_ compatible with DTDs from prior versions of CLDR such as v41 and prior. -> -> To process earlier XML files, use the prior DTD and specification, such as v41 found at +> 👉 Note: CLDR-TC has agreed that the changes required were too extensive to maintain compatibility. For this reason, the `ldmlKeyboard3.dtd` DTD used here is _not_ compatible with DTDs from prior versions of CLDR such as v43 and prior. > +> To process earlier XML files, use the data and specification from v43.1, found at ### Accessibility Keyboard use can be challenging for individuals with various types of disabilities. For this revision, the committee is not evaluating features or architectural designs for the purpose of improving accessibility. Such consideration could be fruitful for future revisions. However, some points on this topic should be made: 1. 
Having an industry-wide standard format for keyboards will enable accessibility software to make use of keyboard data with a reduced dependence on platform-specific knowledge. -2. Some features, such as multiTap and flicks, have the potential to reduce accessibility and thus should be discouraged. For example, multiTap requires pressing keys at a certain speed, and flicks require a more complex movement (press-and-flick) beyond a simple tap. Alternatively, inclusion of accessible methods of generating the same outputs (for example, simple keys on an additional layer), should be considered. +2. Features which require certain levels of mobility or speed of entry should be considered for their impact on accessibility. This impact could be mitigated by means of additional, accessible methods of generating the same output. 3. Public feedback is welcome on any aspects of this document which might hinder accessibility. ## Definitions @@ -352,7 +353,9 @@ For purposes of this current draft specification, the value should always be `te _Attribute:_ `locale` (required) -This attribute represents the primary locale of the keyboard using BCP 47 [Unicode locale identifiers](tr35.md#Canonical_Unicode_Locale_Identifiers) - for example `"el"` for Greek. Sometimes, the locale may not specify the base language. For example, a Devanagari keyboard for many languages could be specified by BCP-47 code: `"mul-Deva"`. For further details, see [Keyboard IDs](#Keyboard_IDs). +This attribute represents the primary locale of the keyboard using BCP 47 [Unicode locale identifiers](tr35.md#Canonical_Unicode_Locale_Identifiers) - for example `"el"` for Greek. Sometimes, the locale may not specify the base language. For example, a Devanagari keyboard for many languages could be specified by BCP-47 code: `"und-Deva"`. However, it is better to list out the languages explicitly using the [`locales`](#element-locales) element. 
+ +For further details about the choice of locale ID, see [Keyboard IDs](#Keyboard_IDs). **Example** (for illustrative purposes only, not indicative of the real data) @@ -416,7 +419,7 @@ The optional `` element allows specifying additional or alternate local _Attribute:_ `id` (required) > The [BCP 47](tr35.md#Canonical_Unicode_Locale_Identifiers) locale ID of an additional language supported by this keyboard. -> Do _not_ include the `-k0-` subtag for this additional language. +> Must _not_ include the `-k0-` subtag for this additional language. **Example** @@ -482,10 +485,10 @@ Element containing informative properties about the layout, for displaying in us **Syntax** ```xml - + ``` > @@ -515,11 +518,15 @@ _Attribute:_ `normalization` _Attribute:_ `layout` > The `layout` attribute describes the layout pattern, such as QWERTY, DVORAK, INSCRIPT, etc. typically used to distinguish various layouts for the same language. +> +> This attribute is not localized, but is an informative identifier for implementation use. _Attribute:_ `indicator` -> The `indicator` attribute describes a short string to be used in currently selected layout indicator, such as US, SI9 etc. +> The `indicator` attribute describes a short string to be used in currently selected layout indicator, such as `US`, `SI9` etc. > Typically, this is shown on a UI element that allows switching keyboard layouts and/or input languages. +> +> This attribute is not localized. * * * @@ -527,7 +534,7 @@ _Attribute:_ `indicator` Element used to store any names given to the layout. -These names are not currently localized. +These names are not localized but are informative names for the keyboard. Localization of these names would be done as separate data items elsewhere in CLDR. **Syntax** @@ -588,12 +595,12 @@ _Attribute:_ `value` (required) ### Element: settings -An element used to keep track of layout specific settings. This element may or may not show up on a layout. 
These settings reflect the normal practice by the implementation. However, an implementation using the data may customize the behavior. +An element used to keep track of layout-specific settings by implementations. This element may or may not show up on a layout. These settings reflect the normal practice by the implementation. However, an implementation using the data may customize the behavior. **Syntax** ```xml - + ``` > @@ -612,7 +619,6 @@ _Attribute:_ `fallback="omit"` If this attribute is present, it must have a value of omit. - **Example** ```xml @@ -626,7 +632,7 @@ If this attribute is present, it must have a value of omit. Indicates that: 1. When a modifier combination goes unmatched, do not output anything when a key is pressed. -2. If a transform is escaped, output the contents of the buffer. +2. If a transform is terminated, output the contents of the buffer. 3. During a transform, hide the contents of the buffer as the user is typing. * * * @@ -673,17 +679,17 @@ This element defines a mapping between an abstract key and its output. This elem ```xml + flicks="{flicks identifier}" + gap="true" + longPress="{long press keys}" + longPressDefault="{default longpress target}" + multiTap="{the output on subsequent taps}" + stretch="true" + switch="{layer id}" + to="{the output}" + transform="no" + width="{key width}" + /> ``` > @@ -705,7 +711,7 @@ _Attribute:_ `id` > > In the future, this attribute’s definition is expected to be updated to align with [UAX#31](https://www.unicode.org/reports/tr31/). Please see [CLDR-17043](https://unicode-org.atlassian.net/browse/CLDR-17043) for more details. -_Attribute:_ `flicks="flick-id"` (optional) +_Attribute:_ `flicks="{flick id}"` (optional) > The `flicks` attribute indicates that this key makes use of a [`flicks`](#Element_flicks) set with the specified id. 
@@ -768,7 +774,7 @@ _Attribute:_ `to` _Attribute:_ `transform="no"` (optional) -> The `transform` attribute is used to define a key that does not participate in a transform (until the next keystroke). This attribute value must be `no` if the attribute is present. +> The `transform` attribute is used to define a key that does not participate in a transform (until the next keystroke). This attribute value must be `"no"` if the attribute is present. > This attribute is useful where it is desired to output where two different keys could output the same characters (with different key or modifier combinations) but only one of them is intended to participate in a transform. > When the next keystroke is pressed, the prior output may then combine using other transforms. > @@ -947,9 +953,9 @@ where a flick to the Northeast then South produces two code points. ### Element: import The `import` element is used to reference another xml file so that elements are imported from -another file. The use case is to be able to import a standard set of transforms, and similar -from the CLDR repository. `` is not recommended as a way for keyboard authors to -split up their keyboard into multiple files, as the intent is for each single XML file to contain all that is needed for a keyboard layout. +another file. The use case is to be able to import a standard set of `transform`s and similar +from the CLDR repository, especially to be able to share common information relevant to a particular script. +The intent is for each single XML file to contain all that is needed for a keyboard layout, other than required standard import data from the CLDR repository. `` can be used as a child of a number of elements (see the _Parents_ section immediately below). Multiple `` elements may be used, however, `` elements must come before any other sibling elements. If two identical elements are defined, the later element will take precedence, that is, override. 
@@ -978,6 +984,8 @@ _Attribute:_ `base` _Attribute:_ `path` (required) > If `base` is `cldr`, then the `path` must start with a CLDR version (such as `techpreview`) representing the CLDR version to pull imports from. The imports are located in the `keyboard/import` subdirectory of the CLDR source repository. +> Implementations are not required to have all CLDR versions available to them. +> > If `base` is omitted, then `path` is an absolute or relative file path. @@ -1312,11 +1320,16 @@ _Attribute:_ `form` (required) > This attribute specifies the physical layout of a hardware keyboard, > or that the form is a `touch` layout. > -> It is recommended to always have at least one hardware (non-touch) form. +> When using an on-screen touch keyboard, if the keyboard does not specify a `` +> element, a `` element can be used as a fallback alternative. > If there is no `hardware` form, the implementation may need > to choose a different keyboard file, or use some other fallback behavior when using a > hardware keyboard. > +> Because a hardware keyboard facilitates non-trivial amounts of text input, +> and many touch devices can also be connected to a hardware keyboard, it +> is recommended to always have at least one hardware (non-touch) form. +> > Multiple `` elements are allowed with distinct `minDeviceWidth` values. > At most one hardware (non-`touch`) `` element is allowed. If a different key arrangement is desired between, for example, `us` and `iso` formats, these should be separated into two different keyboards. > @@ -1325,9 +1338,6 @@ _Attribute:_ `form` (required) > A mismatch between the hardware layout in the keyboard file, and the actual hardware used by the user could result in some keys being inaccessible to the user if their hardware cannot generate the scancodes corresponding to the layout specified by the `form=` attribute. Such keys could be accessed only via an on-screen keyboard utility.
Conversely, a user with hardware keys that are not present in the specified `form=` will result in some hardware keys which have no function when pressed. > > -> When using an on-screen keyboard, if there is not a `` -> element, the hardware elements can be used for on-screen use. -> > The value of the `form=` attribute may be `touch`, or correspond to a `form` element. See [`form`](#element-form). > @@ -1370,11 +1380,13 @@ _Attribute:_ `modifier` (required for `hardware`) > This has two roles. It acts as an identifier for the `layer` element for hardware keyboards (in the absence of the id= element) and also provides the linkage from the hardware modifiers into the correct `layer`. > -> To indicate that no modifiers apply, the reserved name of `none` can be used. +> To indicate that no modifiers apply, the reserved name of `none` is used. > The following modifier components can be used, separated by spaces. > Note that `L` or `R` indicates a left- or right- side modifier only (such as `altL`) -> whereas `alt` indicates _either_ left or right alt key. -> `shift` also indicates either shift key. +> whereas `alt` indicates _either_ left or right alt key (that is, `altL` or `altR`). `ctrl` indicates either left or right ctrl key (that is, `ctrlL` or `ctrlR`). +> `shift` also indicates either shift key. The left and right shift keys are not distinguishable in this specification. +> +> If there is a layer with a modifier `alt`, there may not be another layer with `altL` or `altR`. Similarly, if there is a layer with a modifier `ctrl`, there may not be a layer with `ctrlL` or `ctrlR`. > > - `none` (no modifier, may not be combined with others) > - `alt` @@ -1386,13 +1398,15 @@ _Attribute:_ `modifier` (required for `hardware`) > - `ctrlR` > - `shift` > -> Note that `alt` is sometimes referred to as _opt_ or _option_. +> Note that `alt` in this specification is referred to on some platforms as "opt" or "option". 
> > Left- and right- side modifiers (such as `"altL ctrlR"` or `"altL altR"`) should not be used together in a single `modifier` attribute value. > > For hardware layouts, the use of `@modifier` as an identifier for a layer is sufficient since it is always unique among the set of `layer` elements in a keyboard. > > The set of modifiers must match `(none|([A-Za-z0-9]+)( [A-Za-z0-9]+)*)` +> +> To share a layer between two modifier sets, the layer data must be duplicated. * * * @@ -2074,11 +2088,11 @@ The relative ordering of `` elements is not significant. + before="{look-behind required match}" + order="{list of weights}" + tertiary="{list of weights}" + tertiaryBase="{list of true/false}" + preBase="{list of true/false}" /> ``` @@ -2323,7 +2337,7 @@ In text editing mode, different keyboard layouts may behave differently in the s ```xml - + ``` From 87375cd4c941e1c12e0e9605526683627327a374 Mon Sep 17 00:00:00 2001 From: Robert Bastian <4706271+robertbastian@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:59:57 +0200 Subject: [PATCH 08/10] CLDR-16543 Fix exemplar cities (#2841) See #2841 --- common/main/en.xml | 21 --------------------- common/main/root.xml | 14 +++++++++++++- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/common/main/en.xml b/common/main/en.xml index b873945ba80..9f50375123e 100644 --- a/common/main/en.xml +++ b/common/main/en.xml @@ -3904,15 +3904,6 @@ annotations. Unknown City - - Dumont d’Urville - - - St. Barthélemy - - - Curaçao - British Summer Time @@ -3926,21 +3917,9 @@ annotations. Kostanay - - Asunción - - - Réunion - - - São Tomé - Uzhhorod - - Kyiv - HST diff --git a/common/main/root.xml b/common/main/root.xml index 7fe90ec3365..56cf38d18d4 100644 --- a/common/main/root.xml +++ b/common/main/root.xml @@ -2899,7 +2899,7 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/ Dumont d’Urville - St. Barthelemy + St. 
Barthélemy Atikokan @@ -3006,6 +3006,18 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/ Ho Chi Minh + + Curaçao + + + Asunción + + + Réunion + + + São Tomé + From fb2b2fc45189038fdae444c3a5a0e8021a770977 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Tue, 3 Oct 2023 17:24:41 +0200 Subject: [PATCH 09/10] CLDR-17123 Document additional characterLabelPatterns (#3302) --- docs/ldml/tr35-general.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/ldml/tr35-general.md b/docs/ldml/tr35-general.md index 31ee64a8702..b8d2699cdc9 100644 --- a/docs/ldml/tr35-general.md +++ b/docs/ldml/tr35-general.md @@ -2657,7 +2657,9 @@ The following are character labels. Where the meaning of the label is fairly cle | limited_use | limited-use | Not in common modern use. | | male | male | Indicates that a character is male or masculine in appearance. | | modifier | modifier | A Unicode modifier letter or symbol. | -| nonspacing | nonspacing | Uses for characters that occupy no width by themselves, such as the ¨ over the a in ä. | +| nonspacing | nonspacing | Used for characters that occupy no width by themselves, such as the ¨ over the a in ä. | +| facing-left | facing-left | Characters that face to the left. Also used to construct names for emoji variants. | +| facing-right | facing-right | Characters that face to the right. Also used to construct names for emoji variants. | ### Typographic Names From 07ea202fa8c11aa48442b1a51062a5c2a208bdde Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Tue, 3 Oct 2023 17:25:41 +0200 Subject: [PATCH 10/10] CLDR-17072 Document new unit systems (#3299) * CLDR-17072 Document new unit systems * CLDR-17072 Tweaks * CLDR-17072 Tweaks for Shanes comments (plus a bit of cleanup). * CLDR-17072 More tweaks * CLDR-17072 More formatting (wish I had a local .md viewer!) 
* CLDR-17072 more tweaks --- docs/ldml/tr35-general.md | 35 +++++++++++++----------- docs/ldml/tr35-info.md | 57 ++++++++++++++++++++++++++++++++------- 2 files changed, 68 insertions(+), 24 deletions(-) diff --git a/docs/ldml/tr35-general.md b/docs/ldml/tr35-general.md index b8d2699cdc9..d2456ffdbb4 100644 --- a/docs/ldml/tr35-general.md +++ b/docs/ldml/tr35-general.md @@ -919,7 +919,7 @@ Some of the constraints reference data from the unitIdComponents in [Unit_Conver
  • per-second
  • Note: The normalized form will have only one "per"
  • -
  • Note:The token 'per' is the single value in <unitIdComponent type=”per”>
  • +
  • Note: The token 'per' is the single value in <unitIdComponent type=”per”>
  • product_unit:= @@ -954,7 +954,7 @@ Some of the constraints reference data from the unitIdComponents in [Unit_Conver "square-"

    | "cubic-"

    | "pow" ([2-9]|1[0-5]) "-"

    • Note: "pow2-" and "pow3-" canonicalize to "square-" and "cubic-"
    • -
    • Note:These are values in <unitIdComponent type=”power”>
    • +
    • Note: These are values in <unitIdComponent type=”power”>
    simple_unit:= @@ -975,11 +975,11 @@ Some of the constraints reference data from the unitIdComponents in [Unit_Conver si_prefix:= "deka" | "hecto" | "kilo", … - + binary_prefix:= "kibi", "mebi", … - + prefix_component:= [a-z]{3,∞} @@ -993,11 +993,12 @@ Some of the constraints reference data from the unitIdComponents in [Unit_Conver or <unitIdComponent type=”power”>
    or <unitIdComponent type=”and”>
    or <unitIdComponent type=”per”>. - -
    • Constraint: must not have a prefix as an initial segment.
    • -
    • Constraint: no two different base_components will share the first 8 letters. +
    • +
    • Constraint: must not have a prefix as an initial segment.
    • +
    • Constraint: no two different base_components will share the first 8 letters. (For more information, see Unit Identifier Uniqueness.) -
    + + suffix_component:= @@ -1007,7 +1008,7 @@ Some of the constraints reference data from the unitIdComponents in [Unit_Conver mixed_unit_identifier:= (single_unit | pu_single_unit) ("-and-" (single_unit | pu_single_unit ))*
    • Example: foot-and-inch
    • -
    • Note:The token 'and' is the single value in <unitIdComponent type=”and”>
    • +
    • Note: The token 'and' is the single value in <unitIdComponent type=”and”>
    long_unit_identifier:= @@ -1018,15 +1019,19 @@ Some of the constraints reference data from the unitIdComponents in [Unit_Conver currency_unit:= "curr-" [a-z]{3} -
    • Constraints: -
      • The first part of the currency_unit is a standard prefix; the second part of the currency unit must be a valid Unicode currency identifier. Note: CLDR does not provide conversions for currencies; this is only intended for formatting.
      • -
    • -
    • Examples: curr-eur-per-square-meter, or pound-per-curr-usd
    • -
    +
      +
    • Constraint: The first part of the currency_unit is a standard prefix; the second part of the currency unit must be a valid Unicode currency identifier.
    • +
    +
      +
    • Examples: curr-eur-per-square-meter, or pound-per-curr-usd
    • +
    • Note: CLDR does not provide conversions for currencies; this is only intended for formatting. + The locale data for currencies is supplied in the currencies element, not in the units element.
    • +
    + -Note that while the syntax allows for number_prefixes in multiple places, the typical use case is only one instances, and after a "-per-". +Note that while the syntax allows for number_prefixes in multiple places, the typical use case is only one instance, after a "-per-". The simple_unit structure does not allow for any two simple_units to overlap. That is, there are no cases where simple_unit1 consists of X-Y and simple_unit2 consists of Y-Z. diff --git a/docs/ldml/tr35-info.md b/docs/ldml/tr35-info.md index a3c7c3274e8..e07b9eee996 100644 --- a/docs/ldml/tr35-info.md +++ b/docs/ldml/tr35-info.md @@ -895,14 +895,53 @@ The factor and offset can be simple expressions, just like the values in the uni Where a factor is not present, the value is 1; where an offset is not present, the value is 0. -The `systems` attribute indicates the measurement system(s). Multiple values may be given; for example, _minute_ is marked as systems="metric ussystem uksystem" - -Attribute Value | Description ------------- | ------------- -_si_ | the _International System of Units (SI)_ -_metric_ | a superset of the _si_ units, with some non-SI units accepted for use with the SI or simple multiples of metric units, such as pound-metric (= ½ kilogram) -_ussystem_ | the inch-pound system as used in the US, also called _US Customary Units_ -_uksystem_ | the inch-pound system as used in the UK, also called _British Imperial Units_, differing mostly in units of volume +The `systems` attribute indicates the measurement system(s) or other characteristics of a set of units. Multiple values may be given; for example, a unit could be marked as systems="`si_acceptable` `metric_adjacent` `prefixable`".
+ +The allowed attribute values are the following: + +Attribute Value | Description +------------ | ------------- +`si` | The _International System of Units (SI)_. See [NIST Guide to the SI, Chapter 4: The Two Classes of SI Units and the SI Prefixes](https://www.nist.gov/pml/special-publication-811/nist-guide-si-chapter-4-two-classes-si-units-and-si-prefixes). Examples: meter, ampere. +`si_acceptable` | Units acceptable for use with the SI. See [NIST Guide to the SI, Chapter 5: Units Outside the SI](https://www.nist.gov/pml/special-publication-811/nist-guide-si-chapter-5-units-outside-si). Examples: hour, liter, knot, hectare. +`metric` | A superset of the _si_ units. +`metric_adjacent` | Units commonly accepted in some countries that follow the metric system. Examples: month, arc-second, pound-metric (= ½ kilogram), mile-scandinavian. +`ussystem` | The inch-pound system as used in the US, also called _US Customary Units_. +`uksystem` | The inch-pound system as used in the UK, also called _British Imperial Units_, differing mostly in units of volume. +`jpsystem` | Traditional units used in Japan. For examples, see [Japanese units of measurement](https://en.wikipedia.org/wiki/Japanese_units_of_measurement). +`astronomical` | Additional units used in astronomy. Examples: parsec, light-year, earth-mass. +`person_age` | Special units used for people’s ages in some languages. Except for translation, they have the same system as the associated regular units. +`currency` | Currency units. These are constructed algorithmically from the Unicode currency identifiers, and do not occur in the child elements of `convertUnits`. Examples: curr-usd (US dollar), curr-eur (Euro). +`prefixable` | Those units that typically use SI prefixes or the [IEC binary prefixes](https://www.nist.gov/pml/special-publication-811/nist-guide-si-appendix-d-bibliography#05). This can include measures like `parsec` that are not SI units.
It allows implementations to group those units together, and to do sanity checks on the prefix+unit combinations, if they choose. However, implementations may choose to allow prefixes on other units, especially since there is a significant variance in usage: even a term like `megafoot` might be acceptable in some contexts. + +Over time, additional systems may be added, and the systems for a particular unit may be refined. + +#### Derived Unit System + +The systems attributes also apply to compound units, and are computed in the following way. + +1. The `prefixable` system is only applicable to base_components, and is thus removed +2. The `number_prefixes`, `dimensionality_prefix`, `si_prefix`, and `binary_prefix` are ignored + * Example: systems(square-kilometer) = systems(meter) +3. Currency units have the `currency` system + * Example: systems(curr-usd) = {currency} +4. Units linked by `-and-`, `-per-`, and *adjacency* are resolved using a modified intersection, where: + 1. The intersection of {… si …} and {… si_acceptable … } is {… si_acceptable …} + 2. The intersection of {… metric …} and {… metric_adjacent … } is {… metric_adjacent …} + +Examples: +``` +systems(liter-per-hectare) + = {si_acceptable metric} ∩ {si_acceptable metric} + = {si_acceptable metric} +systems(meter-per-hectare) + = {si metric} ∩ {si_acceptable metric} + = {si_acceptable metric} +systems(mile-scandinavian-per-hour) + = {metric_adjacent} ∩ {si_acceptable metric_adjacent} + = {metric_adjacent} +``` + +#### Conversion Mechanisms CLDR follows conversion values where possible from: * [NIST Special Publication 1038](https://www.govinfo.gov/content/pkg/GOVPUB-C13-f10c2ff9e7af2091314396a2d53213e4/pdf/GOVPUB-C13-f10c2ff9e7af2091314396a2d53213e4.pdf) @@ -1023,7 +1062,7 @@ Examples: The order of the elements in the file is significant, since it is used in [Unit_Identifier_Normalization](#Unit_Identifier_Normalization). -The quantity values themselves are informative.
Therer mayreflecting that _force per area_ can be referenced as either _pressure_ or _stress_, for example). The quantity for a complex unit that has a reciprocal is formed by prepending “inverse-” to the quantity, such as _inverse-consumption._ +The quantity values themselves are informative. For example, _force per area_ can be referenced as either _pressure_ or _stress_. The quantity for a complex unit that has a reciprocal is formed by prepending “inverse-” to the quantity, such as _inverse-consumption._ The base units for the quantities and the quantities themselves are based on [NIST Special Publication 811](https://www.nist.gov/pml/special-publication-811) and the earlier [NIST Special Publication 1038](https://www.govinfo.gov/content/pkg/GOVPUB-C13-f10c2ff9e7af2091314396a2d53213e4/pdf/GOVPUB-C13-f10c2ff9e7af2091314396a2d53213e4.pdf). In some cases, a different unit is chosen for the base. For example, a _revolution_ (360°) is chosen for the base unit for angles instead of the SI _radian_, and _item_ instead of the SI _mole_. Additional base units are added where necessary, such as _bit_ and _pixel_.