Skip to content

Commit

Permalink
Fix the IndexUnicodeProperties Joining_Type (#657)
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin authored Jan 20, 2024
1 parent bf38a00 commit fdc9c95
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 32 deletions.
3 changes: 3 additions & 0 deletions unicodetools/src/main/java/org/unicode/props/UcdProperty.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.unicode.props.UcdPropertyValues.NFKC_Quick_Check_Values;
import org.unicode.props.UcdPropertyValues.NFKD_Quick_Check_Values;
import org.unicode.props.UcdPropertyValues.Numeric_Type_Values;
import org.unicode.props.UcdPropertyValues.Other_Joining_Type_Values;
import org.unicode.props.UcdPropertyValues.Script_Values;
import org.unicode.props.UcdPropertyValues.Sentence_Break_Values;
import org.unicode.props.UcdPropertyValues.Vertical_Orientation_Values;
Expand Down Expand Up @@ -241,6 +242,8 @@ public enum UcdProperty {
NFKC_Quick_Check(PropertyType.Enumerated, NFKC_Quick_Check_Values.class, null, "NFKC_QC"),
NFKD_Quick_Check(PropertyType.Enumerated, NFKD_Quick_Check_Values.class, null, "NFKD_QC"),
Numeric_Type(PropertyType.Enumerated, Numeric_Type_Values.class, null, "nt"),
Other_Joining_Type(
PropertyType.Enumerated, Other_Joining_Type_Values.class, null, "Other_Joining_Type"),
Sentence_Break(PropertyType.Enumerated, Sentence_Break_Values.class, null, "SB"),
Vertical_Orientation(PropertyType.Enumerated, Vertical_Orientation_Values.class, null, "vo"),
Word_Break(PropertyType.Enumerated, Word_Break_Values.class, null, "WB"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1708,6 +1708,40 @@ public static Numeric_Type_Values forName(String name) {
}

// Numeric_Value
/**
 * Values of the unofficial contributory property {@code Other_Joining_Type}, which is used in
 * the derivation of {@code Joining_Type}.
 *
 * <p>Each constant is registered with its short alias; {@link #Deduce_From_General_Category}
 * uses its long name as its short name as well.
 */
public enum Other_Joining_Type_Values implements Named {
    Join_Causing("C"),
    Dual_Joining("D"),
    Left_Joining("L"),
    Right_Joining("R"),
    Transparent("T"),
    Non_Joining("U"),
    Deduce_From_General_Category("Deduce_From_General_Category");

    /** Lookup from names to constants, built once for the whole enum. */
    private static final NameMatcher<Other_Joining_Type_Values> NAME_MATCHER =
            PropertyNames.getNameToEnums(Other_Joining_Type_Values.class);

    /** The short name and any additional aliases registered for this constant. */
    private final PropertyNames<Other_Joining_Type_Values> names;

    // Enum constructors are implicitly private.
    Other_Joining_Type_Values(String shortName, String... otherNames) {
        this.names =
                new PropertyNames<>(Other_Joining_Type_Values.class, this, shortName, otherNames);
    }

    /**
     * Looks up a constant through the shared name matcher.
     *
     * @param name the name to resolve
     * @return whatever the matcher yields for {@code name}
     */
    public static Other_Joining_Type_Values forName(String name) {
        return NAME_MATCHER.get(name);
    }

    /** Returns the name set registered for this constant. */
    @Override
    public PropertyNames<Other_Joining_Type_Values> getNames() {
        return this.names;
    }

    /** Returns the short name held by this constant's name set. */
    @Override
    public String getShortName() {
        return this.names.getShortName();
    }
}

public enum Script_Values implements Named {
Adlam("Adlm"),
Caucasian_Albanian("Aghb"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ idtype ; Identifier_Type
idns ; Idn_Status
idn8 ; Idn_2008

# Unofficial contributory property used in the derivation of Joining_Type.
Other_Joining_Type ; Other_Joining_Type

# ================================================
# String Properties
# ================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@
# @missing: 0000..10FFFF; kTraditionalVariant ; <none>

# @missing: 0000..10FFFF; Joining_Group ; No_Joining_Group
# @missing: 0000..10FFFF; Joining_Type ; Non_Joining

# Overrides for bugs

Expand Down Expand Up @@ -124,7 +123,6 @@ idn8 ; na ; na

# @missing: 0000..10FFFF; Idn_Mapping ; <code point>


# @missing: 0000..10FFFF; Identifier_Status ; r

idstatus ; r ; Restricted
Expand Down Expand Up @@ -162,3 +160,12 @@ sc ; Zxxx ; Unwritten

# TODO: there is no Unicode 13.1, see https://github.com/unicode-org/unicodetools/issues/100
age; 13.1 ; V13_1

# @missing: 0000..10FFFF; Other_Joining_Type ; Deduce_From_General_Category
Other_Joining_Type ; C ; Join_Causing
Other_Joining_Type ; D ; Dual_Joining
Other_Joining_Type ; L ; Left_Joining
Other_Joining_Type ; R ; Right_Joining
Other_Joining_Type ; T ; Transparent
Other_Joining_Type ; U ; Non_Joining
Other_Joining_Type ; Deduce_From_General_Category ; Deduce_From_General_Category
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ UnicodeData; Simple_Lowercase_Mapping ; 13
UnicodeData; Simple_Titlecase_Mapping ; 14
UnicodeData; Unicode_1_Name ; 10
UnicodeData; ISO_Comment ; 11
ArabicShaping; Joining_Type; 2
# Handle the complex default of ArabicShaping.txt by introducing an unofficial
# contributory property, to be used when deriving Joining_Type.
ArabicShaping; Other_Joining_Type; 2
DerivedJoiningType; Joining_Type; 1
ArabicShaping; Joining_Group; 3
BidiMirroring; Bidi_Mirroring_Glyph;
Blocks ; Block
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,17 @@ Let $nonAlphabeticDependentVowels = [\N{ORIYA SIGN OVERLINE}\N{THAI CHARACTER MA
Let $nonDiacriticNuktas = [\u1BE6\U00010A38\U00010A39\U00010A3A\U0001133B]
[\p{InSc=Nukta} - \p{Diacritic}] = $nonDiacriticNuktas

## Joining_Type and Joining_Group
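# Where a Joining_Group is defined (i.e., it is not No_Joining_Group), it refines the Joining_Type: characters that share a Joining_Group must also share a Joining_Type.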
EquivalencesOf \P{Joining_Group=No_Joining_Group} Joining_Group ⇒ Joining_Type
\p{gc=Mn} ⊆ \p{Joining_Type=Transparent}
\p{gc=Me} ⊆ \p{Joining_Type=Transparent}

# Derivation of Joining_Type from the second column of ArabicShaping.txt (unofficially Other_Joining_Type).
In \P{Other_Joining_Type=Deduce_From_General_Category} Joining_Type = Other_Joining_Type
[ \p{Other_Joining_Type=Deduce_From_General_Category} & [\p{gc=Mn}\p{gc=Me}\p{gc=Cf}] ] ⊆ \p{Joining_Type=Transparent}
[ \p{Other_Joining_Type=Deduce_From_General_Category} - [\p{gc=Mn}\p{gc=Me}\p{gc=Cf}] ] ⊆ \p{Joining_Type=Non_Joining}

##########################
# LineBreak property
##########################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,35 +214,6 @@ public void TestAAScripts() {
}
}

/**
 * Verifies that all characters sharing a Joining_Group (other than No_Joining_Group) also share
 * a Joining_Type, by comparing every member of a group against the group's first code point.
 */
@Test
public void TestJoiningGroupConsistency() {
    // TODO(egg): I would like to be able to put that in the invariants tests as « the partition
    // defined by Joining_Group is finer than that defined by Joining_Type ».
    final UnicodeMap<String> groupByCharacter = iup.load(UcdProperty.Joining_Group);
    final UnicodeMap<String> typeByCharacter = iup.load(UcdProperty.Joining_Type);

    // Invert the map to group -> characters; the No_Joining_Group bucket is not checked.
    var membersByGroup = new HashMap<String, UnicodeSet>();
    groupByCharacter.addInverseTo(membersByGroup).remove("No_Joining_Group");

    for (var entry : membersByGroup.entrySet()) {
        final String groupName = entry.getKey();
        final UnicodeSet members = entry.getValue();

        // The first code point of the group serves as the reference for the expected type.
        final int reference = members.getRangeStart(0);
        final String expectedType = typeByCharacter.get(reference);

        for (String member : members) {
            final String failureMessage =
                    "U+"
                            + getCodeAndName(Character.toString(reference))
                            + "\nand\nU+"
                            + getCodeAndName(member)
                            + "\nhave different joining types but are in the"
                            + " same joining group ("
                            + groupName
                            + ")\n";
            assertEquals(failureMessage, expectedType, typeByCharacter.get(member));
        }
    }
}

@Test
public void TestScripts() {

Expand Down

0 comments on commit fdc9c95

Please sign in to comment.