Skip to content

Commit

Permalink
UTS46 replace disallowed_STD3_ with check in Validity Criteria
Browse files (browse the repository at this point in the history)
  • Loading branch information
markusicu committed Jul 3, 2024
1 parent 3ec5c0c commit ec3ff84
Show file tree
Hide file tree
Showing 6 changed files with 9,321 additions and 9,297 deletions.
18,372 changes: 9,186 additions & 9,186 deletions unicodetools/data/idna/dev/IdnaMappingTable.txt

Large diffs are not rendered by default.

158 changes: 83 additions & 75 deletions unicodetools/data/idna/dev/IdnaTestV2.txt

Large diffs are not rendered by default.

41 changes: 13 additions & 28 deletions unicodetools/src/main/java/org/unicode/idna/GenerateIdna.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,10 @@ public class GenerateIdna {
Settings.Output.GEN_DIR + "idna/" + Default.ucdVersion() + "/";

// Utility.WORKSPACE_DIRECTORY + "draft/reports/tr46/data";
private static final int MAX_STATUS_LENGTH = "disallowed_STD3_mapped".length();
private static final int MAX_STATUS_LENGTH = "disallowed".length();
public static UnicodeSet U32;
public static UnicodeSet U40;
public static UnicodeSet VALID_ASCII;
public static UnicodeSet NSTD3_ASCII;
public static UnicodeSet ASCII_EXCEPT_DOT;
static ToolUnicodePropertySource properties;
static UnicodeSet cn;
static UnicodeSet bidiControls;
Expand Down Expand Up @@ -74,14 +73,12 @@ public static void main(String[] args) throws IOException {
}

U32 = new UnicodeSet("[:age=3.2:]").freeze();
VALID_ASCII = new UnicodeSet("[\\u002Da-zA-Z0-9]").freeze();
NSTD3_ASCII = new UnicodeSet("[[\\u0000-\\u007F]-[.]]").freeze();
ASCII_EXCEPT_DOT = new UnicodeSet("[[\\u0000-\\u007F]-[.]]").freeze();
properties = ToolUnicodePropertySource.make(Default.ucdVersion());
cn = properties.getSet("gc=Cn").freeze();
bidiControls = properties.getSet("bidi_control=true");

final UnicodeMap<Row.R2<IdnaType, String>> mappingTable = createMappingTable(true);
final UnicodeMap<Row.R2<IdnaType, String>> mappingTableNSTD3 = createMappingTable(false);
final UnicodeMap<Row.R2<IdnaType, String>> mappingTable = createMappingTable();
{
final UnicodeMap<String> mappings = new UnicodeMap<String>();
final UnicodeMap<IdnaType> types = new UnicodeMap<IdnaType>();
Expand All @@ -101,8 +98,7 @@ public static void main(String[] args) throws IOException {

for (int cp = 0; cp <= 0x10FFFF; ++cp) {
final Row.R2<IdnaType, String> value = mappingTable.get(cp);
final Row.R2<IdnaType, String> valueNstd3 = mappingTableNSTD3.get(cp);
if (value == null || valueNstd3 == null) {
if (value == null) {
throw new IllegalArgumentException("Expected value for " + Utility.hex(cp));
}
final IdnaType status = value.get0();
Expand All @@ -116,21 +112,13 @@ public static void main(String[] args) throws IOException {
: status);
}

final IdnaType statusNstd3 = valueNstd3.get0();
String endStatus =
statusNstd3 == status ? status.toString() : status + "_STD3_" + statusNstd3;
String endStatus = status.toString();
final String mapping = value.get1();
final String mappingNstd3 = valueNstd3.get1();
// if mapped, add info
if (status == IdnaType.mapped
|| status == IdnaType.deviation
|| statusNstd3 == IdnaType.mapped
|| statusNstd3 == IdnaType.deviation) {
if (status == IdnaType.mapped || status == IdnaType.deviation) {
endStatus += Utility.repeat(" ", MAX_STATUS_LENGTH - endStatus.length()) + " ; ";
if (mapping != null && mapping.length() != 0) {
endStatus += Utility.hex(mapping);
} else if (mappingNstd3 != null && mappingNstd3.length() != 0) {
endStatus += Utility.hex(mappingNstd3);
}
} else {
if (mapping != null) {
Expand Down Expand Up @@ -316,8 +304,7 @@ private static boolean equals(Object a, Object b) {
return a.equals(b);
}

private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable(boolean STD3) {

private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable() {
final UnicodeMap<String> nfkc_cfMap = properties.getProperty("NFKC_CF").getUnicodeMap();
final UnicodeMap<String> baseMapping = new UnicodeMap<String>().putAll(nfkc_cfMap);
baseMapping.put(0xFF0E, "\u002E");
Expand Down Expand Up @@ -356,12 +343,11 @@ private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable(boolean S
// would not be valid, and capital sharp s would end up disallowed.
.add(0x00DF)
// .addAll(0x200c, 0x200d)
.addAll(STD3 ? VALID_ASCII : NSTD3_ASCII)
.addAll(ASCII_EXCEPT_DOT)
.freeze();

System.out.println(
STD3
+ " Base Valid Set & nfkcqc=n"
"Base Valid Set & nfkcqc=n"
+ new UnicodeSet("[:nfkcqc=n:]").retainAll(baseValidSet));

// https://unicode.org/reports/tr46/#TableDerivationStep3
Expand All @@ -372,10 +358,9 @@ private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable(boolean S
// U+E00xx tag characters
final UnicodeSet baseExclusionSet = new UnicodeSet(0xFFFC, 0xFFFD, 0xE0001, 0xE007F);

System.out.println(STD3 + " base valid set:\t" + baseValidSet);
System.out.println("base valid set:\t" + baseValidSet);
System.out.println(
STD3
+ " ***Overlap with baseValidSet and baseExclusionSet:\t"
"***Overlap with baseValidSet and baseExclusionSet:\t"
+ new UnicodeSet(baseValidSet).retainAll(baseExclusionSet));

final UnicodeSet deviationSet =
Expand Down Expand Up @@ -475,7 +460,7 @@ private static UnicodeMap<Row.R2<IdnaType, String>> createMappingTable(boolean S
}
}
mappingTable.putAll(excluded, disallowedResult);
System.out.println(STD3 + " ***Step 7 Invalid Exclusion: " + excluded);
System.out.println("***Step 7 Invalid Exclusion: " + excluded);
} while (excluded.size() != 0);

// detect errors, where invalid character doesn't have at least one invalid in decomposition
Expand Down
16 changes: 8 additions & 8 deletions unicodetools/src/main/java/org/unicode/idna/Uts46.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ public class Uts46 extends Idna {

private final boolean isUnicode15OrEarlier;

private UnicodeSet disallowedSTD3 = new UnicodeSet();
/** ASCII characters other than lowercase ASCII letters, digits, and hyphen-minus. */
private UnicodeSet disallowedSTD3 =
new UnicodeSet(0, 0x7f).remove('-').remove('0', '9').remove('a', 'z').freeze();

private Uts46() {
String path = Settings.UnicodeTools.getDataPathStringForLatestVersion("idna");
Expand All @@ -43,11 +45,6 @@ class MyHandler extends FileUtilities.SemiFileReader {
@Override
public boolean handleLine(int start, int end, String[] items) {
String status = items[1];
final int dash = status.indexOf("_STD3");
if (dash >= 0) {
disallowedSTD3.add(start, end);
status = status.substring(0, dash);
}
final IdnaType type = IdnaType.valueOf(status);
types.putAll(start, end, type);

Expand Down Expand Up @@ -455,7 +452,7 @@ public enum Errors {
// [IDNA2008] RFC 5893, Section 2.
// --> see Cn errors

/** U1 for UseSTD3ASCIIRules: Replaces V7 for disallowed_STD3_*. */
/** U1 for UseSTD3ASCIIRules: Replaces V7 for ASCII other than lowercase LDH. */
U1(0),

A3(UIDNA_ERROR_PUNYCODE),
Expand Down Expand Up @@ -689,14 +686,17 @@ private void checkLabelValidity(String label, IdnaChoice idnaChoice, Set<Errors>
// or deviation.
switch (type) {
case valid:
if (disallowedSTD3.contains(cp)) {
errors.add(Errors.U1);
}
break;
case deviation:
if (idnaChoice == IdnaChoice.transitional) {
errors.add(Errors.V7);
}
break;
default:
errors.add(disallowedSTD3.contains(cp) ? Errors.U1 : Errors.V7);
errors.add(Errors.V7);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,26 @@ public void CheckProps(
boolean same = true;
for (String value : values) {
UnicodeSet oldSet = oldStatus.getSet(value);
if (ucdProperty == UcdProperty.Idn_Status && Settings.latestVersion.equals("16.0.0")) {
// Until Unicode 15.1, we had conditional Status values
// disallowed_STD3_valid and disallowed_STD3_mapped.
// At runtime, if UseSTD3ASCIIRules=true, they resolved to disallowed;
// if UseSTD3ASCIIRules=false, they resolved to valid or mapped, respectively.
// Unicode 16 replaces them with valid/mapped and handles UseSTD3ASCIIRules=true
// while checking the Validity Criteria.
switch (value) {
case "disallowed_STD3_valid":
case "disallowed_STD3_mapped":
continue;
case "valid":
case "mapped":
UnicodeSet disallowedSTD3 = oldStatus.getSet("disallowed_STD3_" + value);
oldSet.addAll(disallowedSTD3);
break;
default:
break;
}
}
UnicodeSet newSet = currentStatus.getSet(value);
same &= oldSet.equals(newSet);
if (!newSet.containsAll(oldSet)) {
Expand Down
11 changes: 11 additions & 0 deletions unicodetools/src/test/java/org/unicode/unittest/TestIdnaTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,17 @@ public void testBackwardsCompatibility() {
}
{
Idn_Status_Values lastStatus = idnaStatusLast.get(x);
// Until Unicode 15.1, we had conditional Status values
// disallowed_STD3_valid and disallowed_STD3_mapped.
// At runtime, if UseSTD3ASCIIRules=true, they resolved to disallowed;
// if UseSTD3ASCIIRules=false, they resolved to valid or mapped, respectively.
// Unicode 16 replaces them with valid/mapped and handles UseSTD3ASCIIRules=true
// while checking the Validity Criteria.
if (lastStatus == Idn_Status_Values.disallowed_STD3_valid) {
lastStatus = Idn_Status_Values.valid;
} else if (lastStatus == Idn_Status_Values.disallowed_STD3_mapped) {
lastStatus = Idn_Status_Values.mapped;
}
Idn_Status_Values currentStatus = idnaStatus.get(x);
boolean skip = changingIn16.contains(c0);
if (!skip) {
Expand Down

0 comments on commit ec3ff84

Please sign in to comment.