From 9de952836fa17ce717b840b8f02947fbfa6b95c9 Mon Sep 17 00:00:00 2001 From: Peter Edberg <42151464+pedberg-icu@users.noreply.github.com> Date: Tue, 2 Apr 2024 22:23:27 -0700 Subject: [PATCH] CLDR-16034 Check for pairing bidi markup chars (illegal); remove 2 (unpaired) RLO in ff_Adlm (#3606) --- common/main/ff_Adlm.xml | 4 ++-- .../java/org/unicode/cldr/test/CheckCLDR.java | 3 ++- .../unicode/cldr/test/CheckForExemplars.java | 23 +++++++++++++++++-- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/common/main/ff_Adlm.xml b/common/main/ff_Adlm.xml index 02764f04b2b..741d16d3faf 100644 --- a/common/main/ff_Adlm.xml +++ b/common/main/ff_Adlm.xml @@ -716,7 +716,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic 𞤄𞤢𞤸𞤢𞤥𞤢𞥄𞤧 𞤄𞤵𞥅𞤼𞤢𞥄𞤲 𞤅𞤵𞤪𞤭𞥅𞤪𞤫 𞤄𞤵𞥅𞤾𞤫𞥅 - ‮𞤄𞤮𞤼𞤧𞤵𞤱𞤢𞥄𞤲𞤢 + 𞤄𞤮𞤼𞤧𞤵𞤱𞤢𞥄𞤲𞤢 𞤄𞤫𞤤𞤢𞤪𞤵𞥅𞤧 𞤄𞤫𞤤𞤭𞥅𞥁 𞤑𞤢𞤲𞤢𞤣𞤢𞥄 @@ -8603,7 +8603,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic 𞤐𞤵𞥅𞤳 - ‮𞤋𞤼𞥆𞤮𞤳𞤮𞤪𞤼𞤮𞥅𞤪𞤥𞤭𞥅𞤼 + 𞤋𞤼𞥆𞤮𞤳𞤮𞤪𞤼𞤮𞥅𞤪𞤥𞤭𞥅𞤼 𞤁𞤢𞥄𞤲𞤥𞤢𞤪𞤳𞥃𞤢𞥄𞤾𞤲 diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckCLDR.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckCLDR.java index 6f6202ac9c0..3640f1bd7e1 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckCLDR.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckCLDR.java @@ -902,7 +902,8 @@ public enum Subtype { namePlaceholderProblem, missingSpaceBetweenNameFields, illegalParameterValue, - illegalAnnotationCode; + illegalAnnotationCode, + illegalCharacter; @Override public String toString() { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java index b7379b3f176..8b8df2b397f 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java @@ -51,8 +51,10 @@ import org.unicode.cldr.util.XPathParts; public 
class CheckForExemplars extends FactoryCheckCLDR { - private static final UnicodeSet RTL_CONTROLS = - new UnicodeSet("[\\u061C\\u200E\\u200F\\u202A-\\u202D\\u2066-\\u2069]"); + private static final UnicodeSet RTL_CONTROLS = new UnicodeSet("[\\u061C\\u200E\\u200F]"); + + private static final UnicodeSet ILLEGAL_RTL_CONTROLS = + new UnicodeSet("[\\u202A-\\u202E\\u2066-\\u2069]"); private static final UnicodeSet RTL = new UnicodeSet("[[:bc=AL:][:bc=R:]]"); @@ -318,6 +320,9 @@ public CheckCLDR handleCheck( // if (path.indexOf("/calendar") >= 0 && path.indexOf("gregorian") <= 0) return this; } + // Check all paths for illegal characters, even EXEMPLAR_SKIPS + checkIllegalCharacters(path, value, result); + if (containsPart(path, EXEMPLAR_SKIPS)) { return this; } @@ -568,6 +573,20 @@ public CheckCLDR handleCheck( return this; } + // Check for characters that are always illegal in values. + // Currently those are just the paired bidi marks. + private void checkIllegalCharacters(String path, String value, List<CheckStatus> result) { + if (ILLEGAL_RTL_CONTROLS.containsSome(value)) { + result.add( + new CheckStatus() + .setCause(this) + .setMainType(CheckStatus.errorType) + .setSubtype(Subtype.illegalCharacter) + .setMessage( + "Bidi markup can only include LRM RLM ALM, not paired characters such as FSI PDI")); + } + } + private String checkAndReplacePlaceholders( String path, String value, List<CheckStatus> result) { CheckStatus.Type statusType =