Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UTS46 replace disallowed_STD3_ with check in Validity Criteria #874

Merged
merged 2 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions UnicodeJsps/src/test/java/org/unicode/jsptest/TestJsp.java
Original file line number Diff line number Diff line change
Expand Up @@ -929,7 +929,7 @@ public void TestIdna() {
checkValues(error, Uts46.SINGLETON);
checkValidIdna(Uts46.SINGLETON, "À。÷");
checkValidIdna(Uts46.SINGLETON, "≠"); // valid since Unicode 15.1
checkInvalidIdna(Uts46.SINGLETON, "\u0001");
checkInvalidIdna(Uts46.SINGLETON, "\u0080");
checkToUnicode(Uts46.SINGLETON, "ß。ab", "ß.ab");
// checkToPunyCode(Uts46.SINGLETON, "\u0002", "xn---");
checkToPunyCode(Uts46.SINGLETON, "ß。ab", "ss.ab");
Expand Down Expand Up @@ -973,7 +973,8 @@ public void TestIdna() {
/**
 * Runs a small fixed set of IDNA checks against the given {@link Idna} instance.
 *
 * <p>The body delegates to three helpers: {@code checkToUnicodeAndPunyCode} for one label
 * pair, {@code checkValidIdna} for the single letter "a", and {@code checkInvalidIdna} for
 * each character expected to be rejected.
 *
 * @param error not read anywhere in this method body
 * @param idna the implementation handed unchanged to every check helper
 */
private void checkValues(boolean[] error, Idna idna) {
// Presumably (source, expected Unicode form, expected Punycode form) — verify against the
// helper's signature, which is not visible here.
checkToUnicodeAndPunyCode(idna, "α.xn--mxa", "α.α", "xn--mxa.xn--mxa");
checkValidIdna(idna, "a");
// NOTE(review): this listing is a rendered diff without +/- markers. The hunk header
// (-973,7 +973,8) suggests the "=" check below is the removed line and the U+33C2 check
// is its replacement — confirm against the merged file before relying on both.
checkInvalidIdna(idna, "=");
// 33C2 ; disallowed # 1.1 SQUARE AM
checkInvalidIdna(idna, "㏂");
}

private void checkToUnicodeAndPunyCode(
Expand Down
18,372 changes: 9,186 additions & 9,186 deletions unicodetools/data/idna/dev/IdnaMappingTable.txt

Large diffs are not rendered by default.

159 changes: 86 additions & 73 deletions unicodetools/data/idna/dev/IdnaTestV2.txt

Large diffs are not rendered by default.

41 changes: 13 additions & 28 deletions unicodetools/src/main/java/org/unicode/idna/GenerateIdna.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,10 @@ public class GenerateIdna {
Settings.Output.GEN_DIR + "idna/" + Default.ucdVersion() + "/";

// Utility.WORKSPACE_DIRECTORY + "draft/reports/tr46/data";
private static final int MAX_STATUS_LENGTH = "disallowed_STD3_mapped".length();
private static final int MAX_STATUS_LENGTH = "disallowed".length();
public static UnicodeSet U32;
public static UnicodeSet U40;
public static UnicodeSet VALID_ASCII;
public static UnicodeSet NSTD3_ASCII;
public static UnicodeSet ASCII_EXCEPT_DOT;
static ToolUnicodePropertySource properties;
static UnicodeSet cn;
static UnicodeSet bidiControls;
Expand Down Expand Up @@ -74,14 +73,12 @@ public static void main(String[] args) throws IOException {
}

U32 = new UnicodeSet("[:age=3.2:]").freeze();
VALID_ASCII = new UnicodeSet("[\\u002Da-zA-Z0-9]").freeze();
NSTD3_ASCII = new UnicodeSet("[[\\u0000-\\u007F]-[.]]").freeze();
ASCII_EXCEPT_DOT = new UnicodeSet("[[\\u0000-\\u007F]-[.]]").freeze();
properties = ToolUnicodePropertySource.make(Default.ucdVersion());
cn = properties.getSet("gc=Cn").freeze();
bidiControls = properties.getSet("bidi_control=true");

final UnicodeMap<Row.R2<IdnaType, String>> mappingTable = createMappingTable(true);
final UnicodeMap<Row.R2<IdnaType, String>> mappingTableNSTD3 = createMappingTable(false);
final UnicodeMap<Row.R2<IdnaType, String>> mappingTable = createMappingTable();
{
final UnicodeMap<String> mappings = new UnicodeMap<String>();
final UnicodeMap<IdnaType> types = new UnicodeMap<IdnaType>();
Expand All @@ -101,8 +98,7 @@ public static void main(String[] args) throws IOException {

for (int cp = 0; cp <= 0x10FFFF; ++cp) {
final Row.R2<IdnaType, String> value = mappingTable.get(cp);
final Row.R2<IdnaType, String> valueNstd3 = mappingTableNSTD3.get(cp);
if (value == null || valueNstd3 == null) {
if (value == null) {
throw new IllegalArgumentException("Expected value for " + Utility.hex(cp));
}
final IdnaType status = value.get0();
Expand All @@ -116,21 +112,13 @@ public static void main(String[] args) throws IOException {
: status);
}

final IdnaType statusNstd3 = valueNstd3.get0();
String endStatus =
statusNstd3 == status ? status.toString() : status + "_STD3_" + statusNstd3;
String endStatus = status.toString();
final String mapping = value.get1();
final String mappingNstd3 = valueNstd3.get1();
// if mapped, add info
if (status == IdnaType.mapped
|| status == IdnaType.deviation
|| statusNstd3 == IdnaType.mapped
|| statusNstd3 == IdnaType.deviation) {
if (status == IdnaType.mapped || status == IdnaType.deviation) {
endStatus += Utility.repeat(" ", MAX_STATUS_LENGTH - endStatus.length()) + " ; ";
if (mapping != null && mapping.length() != 0) {
endStatus += Utility.hex(mapping);
} else if (mappingNstd3 != null && mappingNstd3.length() != 0) {
endStatus += Utility.hex(mappingNstd3);
}
} else {
if (mapping != null) {
Expand Down Expand Up @@ -316,8 +304,7 @@ private static boolean equals(Object a, Object b) {
return a.equals(b);
}

private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable(boolean STD3) {

private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable() {
final UnicodeMap<String> nfkc_cfMap = properties.getProperty("NFKC_CF").getUnicodeMap();
final UnicodeMap<String> baseMapping = new UnicodeMap<String>().putAll(nfkc_cfMap);
baseMapping.put(0xFF0E, "\u002E");
Expand Down Expand Up @@ -356,12 +343,11 @@ private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable(boolean S
// would not be valid, and capital sharp s would end up disallowed.
.add(0x00DF)
// .addAll(0x200c, 0x200d)
.addAll(STD3 ? VALID_ASCII : NSTD3_ASCII)
.addAll(ASCII_EXCEPT_DOT)
.freeze();

System.out.println(
STD3
+ " Base Valid Set & nfkcqc=n"
"Base Valid Set & nfkcqc=n"
+ new UnicodeSet("[:nfkcqc=n:]").retainAll(baseValidSet));

// https://unicode.org/reports/tr46/#TableDerivationStep3
Expand All @@ -372,10 +358,9 @@ private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable(boolean S
// U+E00xx tag characters
final UnicodeSet baseExclusionSet = new UnicodeSet(0xFFFC, 0xFFFD, 0xE0001, 0xE007F);

System.out.println(STD3 + " base valid set:\t" + baseValidSet);
System.out.println("base valid set:\t" + baseValidSet);
System.out.println(
STD3
+ " ***Overlap with baseValidSet and baseExclusionSet:\t"
"***Overlap with baseValidSet and baseExclusionSet:\t"
+ new UnicodeSet(baseValidSet).retainAll(baseExclusionSet));

final UnicodeSet deviationSet =
Expand Down Expand Up @@ -475,7 +460,7 @@ private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable(boolean S
}
}
mappingTable.putAll(excluded, disallowedResult);
System.out.println(STD3 + " ***Step 7 Invalid Exclusion: " + excluded);
System.out.println("***Step 7 Invalid Exclusion: " + excluded);
} while (excluded.size() != 0);

// detect errors, where invalid character doesn't have at least one invalid in decomposition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -974,5 +974,10 @@ public static UnicodeSet getIdna2008Valid() {
// IdnaTestV2.txt missed a bug in the UTS 46 implementation that I'm writing due to
// not testing an upper-case letter in the ASCII part of Punycode when there are no errors.
"xn--A-1ga",
// https://www.unicode.org/L2/L2024/24063-pubrev.html#ID20240402104744 / PAG issue #282:
// Subtle behavior change for UseSTD3ASCIIRules=true
// due to simplified checking only in Validity Criteria, after Map+Normalize.
// fullwidth equals + combining solidus overlay
"\uFF1D\u0338",
};
}
16 changes: 8 additions & 8 deletions unicodetools/src/main/java/org/unicode/idna/Uts46.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ public class Uts46 extends Idna {

private final boolean isUnicode15OrEarlier;

private UnicodeSet disallowedSTD3 = new UnicodeSet();
/** ASCII characters other than lowercase ASCII letters, digits, and hyphen-minus. */
private UnicodeSet disallowedSTD3 =
new UnicodeSet(0, 0x7f).remove('-').remove('0', '9').remove('a', 'z').freeze();

private Uts46() {
String path = Settings.UnicodeTools.getDataPathStringForLatestVersion("idna");
Expand All @@ -43,11 +45,6 @@ class MyHandler extends FileUtilities.SemiFileReader {
@Override
public boolean handleLine(int start, int end, String[] items) {
String status = items[1];
final int dash = status.indexOf("_STD3");
if (dash >= 0) {
disallowedSTD3.add(start, end);
status = status.substring(0, dash);
}
final IdnaType type = IdnaType.valueOf(status);
types.putAll(start, end, type);

Expand Down Expand Up @@ -455,7 +452,7 @@ public enum Errors {
// [IDNA2008] RFC 5893, Section 2.
// --> see Cn errors

/** U1 for UseSTD3ASCIIRules: Replaces V7 for disallowed_STD3_*. */
/** U1 for UseSTD3ASCIIRules: Replaces V7 for ASCII other than lowercase LDH. */
U1(0),

A3(UIDNA_ERROR_PUNYCODE),
Expand Down Expand Up @@ -689,14 +686,17 @@ private void checkLabelValidity(String label, IdnaChoice idnaChoice, Set<Errors>
// or deviation.
switch (type) {
case valid:
if (disallowedSTD3.contains(cp)) {
errors.add(Errors.U1);
}
break;
case deviation:
if (idnaChoice == IdnaChoice.transitional) {
errors.add(Errors.V7);
}
break;
default:
errors.add(disallowedSTD3.contains(cp) ? Errors.U1 : Errors.V7);
errors.add(Errors.V7);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,26 @@ public void CheckProps(
boolean same = true;
for (String value : values) {
UnicodeSet oldSet = oldStatus.getSet(value);
if (ucdProperty == UcdProperty.Idn_Status && Settings.latestVersion.equals("16.0.0")) {
// Until Unicode 15.1, we had conditional Status values
// disallowed_STD3_valid and disallowed_STD3_mapped.
// At runtime, if UseSTD3ASCIIRules=true, they resolved to disallowed;
// if UseSTD3ASCIIRules=false, they resolved to valid or mapped, respectively.
// Unicode 16 replaces them with valid/mapped and handles UseSTD3ASCIIRules=true
// while checking the Validity Criteria.
switch (value) {
case "disallowed_STD3_valid":
case "disallowed_STD3_mapped":
continue;
case "valid":
case "mapped":
UnicodeSet disallowedSTD3 = oldStatus.getSet("disallowed_STD3_" + value);
oldSet.addAll(disallowedSTD3);
break;
default:
break;
}
}
UnicodeSet newSet = currentStatus.getSet(value);
same &= oldSet.equals(newSet);
if (!newSet.containsAll(oldSet)) {
Expand Down
11 changes: 11 additions & 0 deletions unicodetools/src/test/java/org/unicode/unittest/TestIdnaTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,17 @@ public void testBackwardsCompatibility() {
}
{
Idn_Status_Values lastStatus = idnaStatusLast.get(x);
// Until Unicode 15.1, we had conditional Status values
// disallowed_STD3_valid and disallowed_STD3_mapped.
// At runtime, if UseSTD3ASCIIRules=true, they resolved to disallowed;
// if UseSTD3ASCIIRules=false, they resolved to valid or mapped, respectively.
// Unicode 16 replaces them with valid/mapped and handles UseSTD3ASCIIRules=true
// while checking the Validity Criteria.
if (lastStatus == Idn_Status_Values.disallowed_STD3_valid) {
lastStatus = Idn_Status_Values.valid;
} else if (lastStatus == Idn_Status_Values.disallowed_STD3_mapped) {
lastStatus = Idn_Status_Values.mapped;
}
Idn_Status_Values currentStatus = idnaStatus.get(x);
boolean skip = changingIn16.contains(c0);
if (!skip) {
Expand Down
Loading