From f65ed6efad45c2f72694ce55d40daf4cf5bbd5bc Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 17 May 2024 15:56:43 -0700 Subject: [PATCH] UTS46 test data fixes for 175-A88 --- unicodetools/data/idna/dev/IdnaTestV2.txt | 30 +++++++++++-------- .../org/unicode/idna/GenerateIdnaTest.java | 2 ++ .../src/main/java/org/unicode/idna/Uts46.java | 1 + .../org/unicode/idna/IdnaTestHeader2.txt | 26 +++++++++------- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/unicodetools/data/idna/dev/IdnaTestV2.txt b/unicodetools/data/idna/dev/IdnaTestV2.txt index cdc2943ab..981f93d86 100644 --- a/unicodetools/data/idna/dev/IdnaTestV2.txt +++ b/unicodetools/data/idna/dev/IdnaTestV2.txt @@ -1,5 +1,5 @@ # IdnaTestV2.txt -# Date: 2024-05-15, 04:28:51 GMT +# Date: 2024-05-17, 22:51:05 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -75,29 +75,33 @@ # Implementations that allow values of particular input flags to be false would ignore # the corresponding status codes listed in the table below when testing for errors. # -# VerifyDnsLength: P4 +# VerifyDnsLength: A4_1, A4_2 # CheckHyphens: V2, V3 # CheckJoiners: Cn # CheckBidi: Bn # UseSTD3ASCIIRules: U1 # +# Implementations that cannot work with ill-formed strings would skip test cases that contain them. +# For example, the status code A3 is set for a Punycode encoding error, +# which may be due to an unpaired surrogate. +# # Implementations may be more strict than the default settings for UTS #46. -# In particular, an implementation conformant to IDNA2008 would disallow the input for lines -# marked with NV8. +# In particular, an implementation conformant to IDNA2008 would skip any line in this test file that +# contained a character in the toUnicode field that has the IDNA2008 Status value NV8 or XV8 +# in IdnaMappingTable.txt. +# For example, it would skip a line containing ¢ (U+00A2 CENT SIGN) in the toUnicode field, +# because of the following line in IdnaMappingTable.txt: +# +# 00A1..00A7 ; valid ; ; NV8 # 1.1 INVERTED EXCLAMATION MARK..SECTION SIGN # # Implementations need only record that there is an error: they need not reproduce the # precise status codes (after removing the ignored status values). # # Compatibility errors # -# The special error codes X3 and X4_2 are now returned where a toASCII error code -# was formerly being generated in toUnicode due to an empty label. -# -# A3 was being generated in the following cases (in addition to its normal usage). -# • an empty label in toUnicode. In this case, it is replaced by X3. -# -# A4_2 was being generated in the following case (in addition to its normal usage). -# • an empty label in V8 (CheckBidi). In this case, it is being replaced by X4_2. +# The special error code X4_2 is now returned where a toASCII error code +# was formerly being generated in toUnicode due to an empty label: +# A4_2 was being generated for an empty label in CheckBidi (in addition to A4_2’s normal usage). # ============================================================================================ fass.de; ; ; ; ; ; # fass.de faß.de; ; ; xn--fa-hia.de; ; fass.de; # faß.de @@ -542,6 +546,8 @@ $; ; [V7]; ; ; ; # $ ⑷.FOUR; ⑷.four; [V7]; xn--csh.four; ; ; # ⑷.four ⑷.Four; ⑷.four; [V7]; xn--csh.four; ; ; # ⑷.four xn--csh.four; ⑷.four; [V7]; xn--csh.four; ; ; # ⑷.four +a\uD900z; ; [V7]; ; [V7, A3]; ; # az +A\uD900Z; a\uD900z; [V7]; ; [V7, A3]; ; # az # RANDOMIZED TESTS diff --git a/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java b/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java index dd42975e6..ecd39b056 100644 --- a/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java +++ b/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java @@ -888,5 +888,7 @@ public static UnicodeSet getIdna2008Valid() { "\u2477.four", // parentheses are disallowed_STD3_valid "(4).four", + // Ill-formed string with an unpaired surrogate. Punycode.encode() fails, and we report A3. + "a" + (char)0xD900 + "z", }; } diff --git a/unicodetools/src/main/java/org/unicode/idna/Uts46.java b/unicodetools/src/main/java/org/unicode/idna/Uts46.java index cd410053b..74553ca6c 100644 --- a/unicodetools/src/main/java/org/unicode/idna/Uts46.java +++ b/unicodetools/src/main/java/org/unicode/idna/Uts46.java @@ -362,6 +362,7 @@ public static boolean hasContextJError(String domain, Set errors) { */ protected String fromPunycode(String label, Set errors) { if (label.isEmpty()) { + // Impossible as long as this function is only called when label.startsWith("xn--"). errors.add(Errors.X3); return label; } diff --git a/unicodetools/src/main/resources/org/unicode/idna/IdnaTestHeader2.txt b/unicodetools/src/main/resources/org/unicode/idna/IdnaTestHeader2.txt index 1ab9668bb..709e3ccf5 100644 --- a/unicodetools/src/main/resources/org/unicode/idna/IdnaTestHeader2.txt +++ b/unicodetools/src/main/resources/org/unicode/idna/IdnaTestHeader2.txt @@ -64,27 +64,31 @@ # Implementations that allow values of particular input flags to be false would ignore # the corresponding status codes listed in the table below when testing for errors. # -# VerifyDnsLength: P4 +# VerifyDnsLength: A4_1, A4_2 # CheckHyphens: V2, V3 # CheckJoiners: Cn # CheckBidi: Bn # UseSTD3ASCIIRules: U1 # +# Implementations that cannot work with ill-formed strings would skip test cases that contain them. +# For example, the status code A3 is set for a Punycode encoding error, +# which may be due to an unpaired surrogate. +# # Implementations may be more strict than the default settings for UTS #46. -# In particular, an implementation conformant to IDNA2008 would disallow the input for lines -# marked with NV8. +# In particular, an implementation conformant to IDNA2008 would skip any line in this test file that +# contained a character in the toUnicode field that has the IDNA2008 Status value NV8 or XV8 +# in IdnaMappingTable.txt. +# For example, it would skip a line containing ¢ (U+00A2 CENT SIGN) in the toUnicode field, +# because of the following line in IdnaMappingTable.txt: +# +# 00A1..00A7 ; valid ; ; NV8 # 1.1 INVERTED EXCLAMATION MARK..SECTION SIGN # # Implementations need only record that there is an error: they need not reproduce the # precise status codes (after removing the ignored status values). # # Compatibility errors # -# The special error codes X3 and X4_2 are now returned where a toASCII error code -# was formerly being generated in toUnicode due to an empty label. -# -# A3 was being generated in the following cases (in addition to its normal usage). -# • an empty label in toUnicode. In this case, it is replaced by X3. -# -# A4_2 was being generated in the following case (in addition to its normal usage). -# • an empty label in V8 (CheckBidi). In this case, it is being replaced by X4_2. +# The special error code X4_2 is now returned where a toASCII error code +# was formerly being generated in toUnicode due to an empty label: +# A4_2 was being generated for an empty label in CheckBidi (in addition to A4_2’s normal usage). # ============================================================================================