From 9de952836fa17ce717b840b8f02947fbfa6b95c9 Mon Sep 17 00:00:00 2001
From: Peter Edberg <42151464+pedberg-icu@users.noreply.github.com>
Date: Tue, 2 Apr 2024 22:23:27 -0700
Subject: [PATCH] CLDR-16034 Check for pairing bidi markup chars (illegal);
remove 2 (unpaired) RLO in ff_Adlm (#3606)
---
common/main/ff_Adlm.xml | 4 ++--
.../java/org/unicode/cldr/test/CheckCLDR.java | 3 ++-
.../unicode/cldr/test/CheckForExemplars.java | 23 +++++++++++++++++--
3 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/common/main/ff_Adlm.xml b/common/main/ff_Adlm.xml
index 02764f04b2b..741d16d3faf 100644
--- a/common/main/ff_Adlm.xml
+++ b/common/main/ff_Adlm.xml
@@ -716,7 +716,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic
𞤄𞤢𞤸𞤢𞤥𞤢𞥄𞤧
𞤄𞤵𞥅𞤼𞤢𞥄𞤲
𞤅𞤵𞤪𞤭𞥅𞤪𞤫 𞤄𞤵𞥅𞤾𞤫𞥅
- 𞤄𞤮𞤼𞤧𞤵𞤱𞤢𞥄𞤲𞤢
+ 𞤄𞤮𞤼𞤧𞤵𞤱𞤢𞥄𞤲𞤢
𞤄𞤫𞤤𞤢𞤪𞤵𞥅𞤧
𞤄𞤫𞤤𞤭𞥅𞥁
𞤑𞤢𞤲𞤢𞤣𞤢𞥄
@@ -8603,7 +8603,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic
𞤐𞤵𞥅𞤳
- 𞤋𞤼𞥆𞤮𞤳𞤮𞤪𞤼𞤮𞥅𞤪𞤥𞤭𞥅𞤼
+ 𞤋𞤼𞥆𞤮𞤳𞤮𞤪𞤼𞤮𞥅𞤪𞤥𞤭𞥅𞤼
𞤁𞤢𞥄𞤲𞤥𞤢𞤪𞤳𞥃𞤢𞥄𞤾𞤲
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckCLDR.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckCLDR.java
index 6f6202ac9c0..3640f1bd7e1 100644
--- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckCLDR.java
+++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckCLDR.java
@@ -902,7 +902,8 @@ public enum Subtype {
namePlaceholderProblem,
missingSpaceBetweenNameFields,
illegalParameterValue,
- illegalAnnotationCode;
+ illegalAnnotationCode,
+ illegalCharacter;
@Override
public String toString() {
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java
index b7379b3f176..8b8df2b397f 100644
--- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java
+++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckForExemplars.java
@@ -51,8 +51,10 @@
import org.unicode.cldr.util.XPathParts;
public class CheckForExemplars extends FactoryCheckCLDR {
- private static final UnicodeSet RTL_CONTROLS =
- new UnicodeSet("[\\u061C\\u200E\\u200F\\u202A-\\u202D\\u2066-\\u2069]");
+ private static final UnicodeSet RTL_CONTROLS = new UnicodeSet("[\\u061C\\u200E\\u200F]");
+
+ private static final UnicodeSet ILLEGAL_RTL_CONTROLS =
+ new UnicodeSet("[\\u202A-\\u202E\\u2066-\\u2069]");
private static final UnicodeSet RTL = new UnicodeSet("[[:bc=AL:][:bc=R:]]");
@@ -318,6 +320,9 @@ public CheckCLDR handleCheck(
// if (path.indexOf("/calendar") >= 0 && path.indexOf("gregorian") <= 0) return this;
}
+ // Check all paths for illegal characters, even EXEMPLAR_SKIPS
+ checkIllegalCharacters(path, value, result);
+
if (containsPart(path, EXEMPLAR_SKIPS)) {
return this;
}
@@ -568,6 +573,20 @@ public CheckCLDR handleCheck(
return this;
}
+ // Adds an illegalCharacter error to result if value contains any paired bidi control
+ // (ILLEGAL_RTL_CONTROLS: U+202A-U+202E, U+2066-U+2069). The path argument is not used here.
+ private void checkIllegalCharacters(String path, String value, List result) {
+ if (ILLEGAL_RTL_CONTROLS.containsSome(value)) {
+ result.add(
+ new CheckStatus()
+ .setCause(this)
+ .setMainType(CheckStatus.errorType)
+ .setSubtype(Subtype.illegalCharacter)
+ .setMessage(
+ "Bidi markup can only include LRM RLM ALM, not paired characters such as FSI PDI"));
+ }
+ }
+
private String checkAndReplacePlaceholders(
String path, String value, List result) {
CheckStatus.Type statusType =