Skip to content

Commit

Permalink
UTS46 test data fixes for 175-A88
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed May 17, 2024
1 parent a8e8f5e commit f65ed6e
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 23 deletions.
30 changes: 18 additions & 12 deletions unicodetools/data/idna/dev/IdnaTestV2.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# IdnaTestV2.txt
# Date: 2024-05-15, 04:28:51 GMT
# Date: 2024-05-17, 22:51:05 GMT
# © 2024 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
Expand Down Expand Up @@ -75,29 +75,33 @@
# Implementations that allow values of particular input flags to be false would ignore
# the corresponding status codes listed in the table below when testing for errors.
#
# VerifyDnsLength: P4
# VerifyDnsLength: A4_1, A4_2
# CheckHyphens: V2, V3
# CheckJoiners: Cn
# CheckBidi: Bn
# UseSTD3ASCIIRules: U1
#
# Implementations that cannot work with ill-formed strings would skip test cases that contain them.
# For example, the status code A3 is set for a Punycode encoding error,
# which may be due to an unpaired surrogate.
#
# Implementations may be more strict than the default settings for UTS #46.
# In particular, an implementation conformant to IDNA2008 would disallow the input for lines
# marked with NV8.
# In particular, an implementation conformant to IDNA2008 would skip any line in this test file that
# contains a character in the toUnicode field that has the IDNA2008 Status value NV8 or XV8
# in IdnaMappingTable.txt.
# For example, it would skip a line containing ¢ (U+00A2 CENT SIGN) in the toUnicode field,
# because of the following line in IdnaMappingTable.txt:
#
# 00A1..00A7 ; valid ; ; NV8 # 1.1 INVERTED EXCLAMATION MARK..SECTION SIGN
#
# Implementations need only record that there is an error: they need not reproduce the
# precise status codes (after removing the ignored status values).
#
# Compatibility errors
#
# The special error codes X3 and X4_2 are now returned where a toASCII error code
# was formerly being generated in toUnicode due to an empty label.
#
# A3 was being generated in the following cases (in addition to its normal usage).
# • an empty label in toUnicode. In this case, it is replaced by X3.
#
# A4_2 was being generated in the following case (in addition to its normal usage).
# • an empty label in V8 (CheckBidi). In this case, it is being replaced by X4_2.
# The special error code X4_2 is now returned where a toASCII error code
# was formerly being generated in toUnicode due to an empty label:
# A4_2 was being generated for an empty label in CheckBidi (in addition to A4_2’s normal usage).
# ============================================================================================
fass.de; ; ; ; ; ; # fass.de
faß.de; ; ; xn--fa-hia.de; ; fass.de; # faß.de
Expand Down Expand Up @@ -542,6 +546,8 @@ $; ; [V7]; ; ; ; # $
⑷.FOUR; ⑷.four; [V7]; xn--csh.four; ; ; # ⑷.four
⑷.Four; ⑷.four; [V7]; xn--csh.four; ; ; # ⑷.four
xn--csh.four; ⑷.four; [V7]; xn--csh.four; ; ; # ⑷.four
a\uD900z; ; [V7]; ; [V7, A3]; ; # az
A\uD900Z; a\uD900z; [V7]; ; [V7, A3]; ; # az

# RANDOMIZED TESTS

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -888,5 +888,7 @@ public static UnicodeSet getIdna2008Valid() {
"\u2477.four",
// parentheses are disallowed_STD3_valid
"(4).four",
// Ill-formed string with an unpaired surrogate. Punycode.encode() fails, and we report A3.
"a" + (char)0xD900 + "z",
};
}
1 change: 1 addition & 0 deletions unicodetools/src/main/java/org/unicode/idna/Uts46.java
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ public static boolean hasContextJError(String domain, Set<Errors> errors) {
*/
protected String fromPunycode(String label, Set<Errors> errors) {
if (label.isEmpty()) {
// Impossible as long as this function is only called when label.startsWith("xn--").
errors.add(Errors.X3);
return label;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,27 +64,31 @@
# Implementations that allow values of particular input flags to be false would ignore
# the corresponding status codes listed in the table below when testing for errors.
#
# VerifyDnsLength: P4
# VerifyDnsLength: A4_1, A4_2
# CheckHyphens: V2, V3
# CheckJoiners: Cn
# CheckBidi: Bn
# UseSTD3ASCIIRules: U1
#
# Implementations that cannot work with ill-formed strings would skip test cases that contain them.
# For example, the status code A3 is set for a Punycode encoding error,
# which may be due to an unpaired surrogate.
#
# Implementations may be more strict than the default settings for UTS #46.
# In particular, an implementation conformant to IDNA2008 would disallow the input for lines
# marked with NV8.
# In particular, an implementation conformant to IDNA2008 would skip any line in this test file that
# contains a character in the toUnicode field that has the IDNA2008 Status value NV8 or XV8
# in IdnaMappingTable.txt.
# For example, it would skip a line containing ¢ (U+00A2 CENT SIGN) in the toUnicode field,
# because of the following line in IdnaMappingTable.txt:
#
# 00A1..00A7 ; valid ; ; NV8 # 1.1 INVERTED EXCLAMATION MARK..SECTION SIGN
#
# Implementations need only record that there is an error: they need not reproduce the
# precise status codes (after removing the ignored status values).
#
# Compatibility errors
#
# The special error codes X3 and X4_2 are now returned where a toASCII error code
# was formerly being generated in toUnicode due to an empty label.
#
# A3 was being generated in the following cases (in addition to its normal usage).
# • an empty label in toUnicode. In this case, it is replaced by X3.
#
# A4_2 was being generated in the following case (in addition to its normal usage).
# • an empty label in V8 (CheckBidi). In this case, it is being replaced by X4_2.
# The special error code X4_2 is now returned where a toASCII error code
# was formerly being generated in toUnicode due to an empty label:
# A4_2 was being generated for an empty label in CheckBidi (in addition to A4_2’s normal usage).
# ============================================================================================

0 comments on commit f65ed6e

Please sign in to comment.