From f65ed6efad45c2f72694ce55d40daf4cf5bbd5bc Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Fri, 17 May 2024 15:56:43 -0700
Subject: [PATCH] UTS46 test data fixes for 175-A88

---
 unicodetools/data/idna/dev/IdnaTestV2.txt     | 30 +++++++++++--------
 .../org/unicode/idna/GenerateIdnaTest.java    |  2 ++
 .../src/main/java/org/unicode/idna/Uts46.java |  1 +
 .../org/unicode/idna/IdnaTestHeader2.txt      | 26 +++++++++-------
 4 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/unicodetools/data/idna/dev/IdnaTestV2.txt b/unicodetools/data/idna/dev/IdnaTestV2.txt
index cdc2943ab..981f93d86 100644
--- a/unicodetools/data/idna/dev/IdnaTestV2.txt
+++ b/unicodetools/data/idna/dev/IdnaTestV2.txt
@@ -1,5 +1,5 @@
 # IdnaTestV2.txt
-# Date: 2024-05-15, 04:28:51 GMT
+# Date: 2024-05-17, 22:51:05 GMT
 # © 2024 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -75,29 +75,33 @@
 # Implementations that allow values of particular input flags to be false would ignore
 # the corresponding status codes listed in the table below when testing for errors.
 #
-# VerifyDnsLength:   P4
+# VerifyDnsLength:   A4_1, A4_2
 # CheckHyphens:      V2, V3
 # CheckJoiners:      Cn
 # CheckBidi:         Bn
 # UseSTD3ASCIIRules: U1
 #
+# Implementations that cannot work with ill-formed strings would skip test cases that contain them.
+# For example, the status code A3 is set for a Punycode encoding error,
+# which may be due to an unpaired surrogate.
+#
 # Implementations may be more strict than the default settings for UTS #46.
-# In particular, an implementation conformant to IDNA2008 would disallow the input for lines
-# marked with NV8.
+# In particular, an implementation conformant to IDNA2008 would skip any line in this test file that
+# contains a character in the toUnicode field that has the IDNA2008 Status value NV8 or XV8
+# in IdnaMappingTable.txt.
+# For example, it would skip a line containing ¢ (U+00A2 CENT SIGN) in the toUnicode field,
+# because of the following line in IdnaMappingTable.txt:
+#
+# 00A1..00A7    ; valid                  ;      ; NV8    # 1.1  INVERTED EXCLAMATION MARK..SECTION SIGN
 #
 # Implementations need only record that there is an error: they need not reproduce the
 # precise status codes (after removing the ignored status values).
 #
 # Compatibility errors
 #
-# The special error codes X3 and X4_2 are now returned where a toASCII error code
-# was formerly being generated in toUnicode due to an empty label.
-#
-# A3 was being generated in the following cases (in addition to its normal usage).
-#   • an empty label in toUnicode. In this case, it is replaced by X3.
-#
-# A4_2 was being generated in the following case (in addition to its normal usage).
-#   • an empty label in V8 (CheckBidi). In this case, it is being replaced by X4_2.
+# The special error code X4_2 is now returned where a toASCII error code
+# was formerly being generated in toUnicode due to an empty label:
+# A4_2 was being generated for an empty label in CheckBidi (in addition to A4_2’s normal usage).
 # ============================================================================================
 fass.de; ; ; ; ; ;  # fass.de
 faß.de; ; ; xn--fa-hia.de; ; fass.de;  # faß.de
@@ -542,6 +546,8 @@ $; ; [V7]; ; ; ;  # $
 ⑷.FOUR; ⑷.four; [V7]; xn--csh.four; ; ;  # ⑷.four
 ⑷.Four; ⑷.four; [V7]; xn--csh.four; ; ;  # ⑷.four
 xn--csh.four; ⑷.four; [V7]; xn--csh.four; ; ;  # ⑷.four
+a\uD900z; ; [V7]; ; [V7, A3]; ;  # az
+A\uD900Z; a\uD900z; [V7]; ; [V7, A3]; ;  # az
 
 # RANDOMIZED TESTS
 
diff --git a/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java b/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java
index dd42975e6..ecd39b056 100644
--- a/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java
+++ b/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java
@@ -888,5 +888,7 @@ public static UnicodeSet getIdna2008Valid() {
         "\u2477.four",
         // parentheses are disallowed_STD3_valid
         "(4).four",
+        // Ill-formed string with an unpaired surrogate. Punycode.encode() fails, and we report A3.
+        "a" + (char)0xD900 + "z",
     };
 }
diff --git a/unicodetools/src/main/java/org/unicode/idna/Uts46.java b/unicodetools/src/main/java/org/unicode/idna/Uts46.java
index cd410053b..74553ca6c 100644
--- a/unicodetools/src/main/java/org/unicode/idna/Uts46.java
+++ b/unicodetools/src/main/java/org/unicode/idna/Uts46.java
@@ -362,6 +362,7 @@ public static boolean hasContextJError(String domain, Set<Errors> errors) {
      */
     protected String fromPunycode(String label, Set<Errors> errors) {
         if (label.isEmpty()) {
+            // Impossible as long as this function is only called when label.startsWith("xn--").
             errors.add(Errors.X3);
             return label;
         }
diff --git a/unicodetools/src/main/resources/org/unicode/idna/IdnaTestHeader2.txt b/unicodetools/src/main/resources/org/unicode/idna/IdnaTestHeader2.txt
index 1ab9668bb..709e3ccf5 100644
--- a/unicodetools/src/main/resources/org/unicode/idna/IdnaTestHeader2.txt
+++ b/unicodetools/src/main/resources/org/unicode/idna/IdnaTestHeader2.txt
@@ -64,27 +64,31 @@
 # Implementations that allow values of particular input flags to be false would ignore
 # the corresponding status codes listed in the table below when testing for errors.
 #
-# VerifyDnsLength:   P4
+# VerifyDnsLength:   A4_1, A4_2
 # CheckHyphens:      V2, V3
 # CheckJoiners:      Cn
 # CheckBidi:         Bn
 # UseSTD3ASCIIRules: U1
 #
+# Implementations that cannot work with ill-formed strings would skip test cases that contain them.
+# For example, the status code A3 is set for a Punycode encoding error,
+# which may be due to an unpaired surrogate.
+#
 # Implementations may be more strict than the default settings for UTS #46.
-# In particular, an implementation conformant to IDNA2008 would disallow the input for lines
-# marked with NV8.
+# In particular, an implementation conformant to IDNA2008 would skip any line in this test file that
+# contains a character in the toUnicode field that has the IDNA2008 Status value NV8 or XV8
+# in IdnaMappingTable.txt.
+# For example, it would skip a line containing ¢ (U+00A2 CENT SIGN) in the toUnicode field,
+# because of the following line in IdnaMappingTable.txt:
+#
+# 00A1..00A7    ; valid                  ;      ; NV8    # 1.1  INVERTED EXCLAMATION MARK..SECTION SIGN
 #
 # Implementations need only record that there is an error: they need not reproduce the
 # precise status codes (after removing the ignored status values).
 #
 # Compatibility errors
 #
-# The special error codes X3 and X4_2 are now returned where a toASCII error code
-# was formerly being generated in toUnicode due to an empty label.
-#
-# A3 was being generated in the following cases (in addition to its normal usage).
-#   • an empty label in toUnicode. In this case, it is replaced by X3.
-#
-# A4_2 was being generated in the following case (in addition to its normal usage).
-#   • an empty label in V8 (CheckBidi). In this case, it is being replaced by X4_2.
+# The special error code X4_2 is now returned where a toASCII error code
+# was formerly being generated in toUnicode due to an empty label:
+# A4_2 was being generated for an empty label in CheckBidi (in addition to A4_2’s normal usage).
 # ============================================================================================