From 616a9484cd1ad45308dce5e728eff44b5f928df6 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 18 Jan 2024 10:51:56 +0100 Subject: [PATCH 1/5] a failing test --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 34928fd9f..1a01c04bd 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -546,6 +546,12 @@ Let $nonAlphabeticDependentVowels = [\N{ORIYA SIGN OVERLINE}\N{THAI CHARACTER MA Let $nonDiacriticNuktas = [\u1BE6\U00010A38\U00010A39\U00010A3A\U0001133B] [\p{InSc=Nukta} - \p{Diacritic}] = $nonDiacriticNuktas +## Joining_Type and Joining_Group +# Where defined, the Joining_Group refines the Joining_Type. +EquivalencesOf \P{Joining_Group=No_Joining_Group} Joining_Group ⇒ Joining_Type +\p{gc=Mn} ⊆ \p{Joining_Type=Transparent} +\p{gc=Me} ⊆ \p{Joining_Type=Transparent} + ########################## # LineBreak property ########################## From d1d4f2408b5658c9ea524688c81f181a0061427b Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 18 Jan 2024 10:36:23 +0100 Subject: [PATCH 2/5] Other_Joining_Type --- .../org/unicode/props/ExtraPropertyAliases.txt | 3 +++ .../org/unicode/props/ExtraPropertyValueAliases.txt | 11 +++++++++-- .../org/unicode/props/IndexUnicodeProperties.txt | 5 ++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt index 25bb25675..6f023eeba 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt @@ -27,6 +27,9 @@ idtype ; Identifier_Type idns ; Idn_Status idn8 ; Idn_2008 +# Unofficial contributory property used in the derivation of Joining_Type. +Other_Joining_Type ; Other_Joining_Type + # ================================================ # String Properties # ================================================ diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt index f61c06c4a..beca905c4 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt @@ -96,7 +96,6 @@ # @missing: 0000..10FFFF; kTraditionalVariant ; # @missing: 0000..10FFFF; Joining_Group ; No_Joining_Group -# @missing: 0000..10FFFF; Joining_Type ; Non_Joining # Overrides for bugs @@ -124,7 +123,6 @@ idn8 ; na ; na # @missing: 0000..10FFFF; Idn_Mapping ; - # @missing: 0000..10FFFF; Identifier_Status ; r idstatus ; r ; Restricted @@ -162,3 +160,12 @@ sc ; Zxxx ; Unwritten # TODO: there is no Unicode 13.1, see https://github.com/unicode-org/unicodetools/issues/100 age; 13.1 ; V13_1 + +# @missing: 0000..10FFFF; Other_Joining_Type ; Deduce_From_General_Category +Other_Joining_Type ; C ; Join_Causing +Other_Joining_Type ; D ; Dual_Joining +Other_Joining_Type ; L ; Left_Joining +Other_Joining_Type ; R ; Right_Joining +Other_Joining_Type ; T ; Transparent +Other_Joining_Type ; U ; Non_Joining +Other_Joining_Type ; Deduce_From_General_Category ; Deduce_From_General_Category \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt index 5e1b020aa..60b93500b 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt @@ -92,7 +92,10 @@ UnicodeData; Simple_Lowercase_Mapping ; 13 UnicodeData; Simple_Titlecase_Mapping ; 14 UnicodeData; Unicode_1_Name ; 10 UnicodeData; ISO_Comment ; 11 -ArabicShaping; Joining_Type; 2 +# Handle the complex default of ArabicShaping.txt by introducing an unofficial +# contributory property, to be used when deriving Joining_Type. +ArabicShaping; Other_Joining_Type; 2 +DerivedJoiningType; Joining_Type; 1 ArabicShaping; Joining_Group; 3 BidiMirroring; Bidi_Mirroring_Glyph; Blocks ; Block From 018541ffbbf27a529afb0b3edcd92fe6e9b69a8a Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 18 Jan 2024 10:36:57 +0100 Subject: [PATCH 3/5] GenerateEnums --- .../java/org/unicode/props/UcdProperty.java | 3 ++ .../org/unicode/props/UcdPropertyValues.java | 34 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java index 750beb165..cd60a6623 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java @@ -29,6 +29,7 @@ import org.unicode.props.UcdPropertyValues.NFKC_Quick_Check_Values; import org.unicode.props.UcdPropertyValues.NFKD_Quick_Check_Values; import org.unicode.props.UcdPropertyValues.Numeric_Type_Values; +import org.unicode.props.UcdPropertyValues.Other_Joining_Type_Values; import org.unicode.props.UcdPropertyValues.Script_Values; import org.unicode.props.UcdPropertyValues.Sentence_Break_Values; import org.unicode.props.UcdPropertyValues.Vertical_Orientation_Values; @@ -241,6 +242,8 @@ public enum UcdProperty { NFKC_Quick_Check(PropertyType.Enumerated, NFKC_Quick_Check_Values.class, null, "NFKC_QC"), NFKD_Quick_Check(PropertyType.Enumerated, NFKD_Quick_Check_Values.class, null, "NFKD_QC"), Numeric_Type(PropertyType.Enumerated, Numeric_Type_Values.class, null, "nt"), + Other_Joining_Type( + PropertyType.Enumerated, Other_Joining_Type_Values.class, null, "Other_Joining_Type"), Sentence_Break(PropertyType.Enumerated, Sentence_Break_Values.class, null, "SB"), Vertical_Orientation(PropertyType.Enumerated, Vertical_Orientation_Values.class, null, "vo"), Word_Break(PropertyType.Enumerated, Word_Break_Values.class, null, "WB"), diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index 64f818680..4307bf30f 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -1708,6 +1708,40 @@ public static Numeric_Type_Values forName(String name) { } // Numeric_Value + public enum Other_Joining_Type_Values implements Named { + Join_Causing("C"), + Dual_Joining("D"), + Left_Joining("L"), + Right_Joining("R"), + Transparent("T"), + Non_Joining("U"), + Deduce_From_General_Category("Deduce_From_General_Category"); + private final PropertyNames names; + + private Other_Joining_Type_Values(String shortName, String... otherNames) { + names = + new PropertyNames( + Other_Joining_Type_Values.class, this, shortName, otherNames); + } + + @Override + public PropertyNames getNames() { + return names; + } + + @Override + public String getShortName() { + return names.getShortName(); + } + + private static final NameMatcher NAME_MATCHER = + PropertyNames.getNameToEnums(Other_Joining_Type_Values.class); + + public static Other_Joining_Type_Values forName(String name) { + return NAME_MATCHER.get(name); + } + } + public enum Script_Values implements Named { Adlam("Adlm"), Caucasian_Albanian("Aghb"), From 578b55237e3fae09594333e75c0e5ae5a0cb6d2c Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 18 Jan 2024 11:04:50 +0100 Subject: [PATCH 4/5] Derivation --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 1a01c04bd..76ae34cb6 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -552,6 +552,11 @@ EquivalencesOf \P{Joining_Group=No_Joining_Group} Joining_Group ⇒ Joining_Type \p{gc=Mn} ⊆ \p{Joining_Type=Transparent} \p{gc=Me} ⊆ \p{Joining_Type=Transparent} +# Derivation of Joining_Type from the second column of ArabicShaping.txt (unofficially Other_Joining_Type). +In \P{Other_Joining_Type=Deduce_From_General_Category} Joining_Type = Other_Joining_Type +[ \p{Other_Joining_Type=Deduce_From_General_Category} & [\p{gc=Mn}\p{gc=Me}\p{gc=Cf}] ] ⊆ \p{Joining_Type=Transparent} +[ \p{Other_Joining_Type=Deduce_From_General_Category} - [\p{gc=Mn}\p{gc=Me}\p{gc=Cf}] ] ⊆ \p{Joining_Type=Non_Joining} + ########################## # LineBreak property ########################## From fcb1e597991ff15ee54dc8eca9b1a125eb120dbd Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 18 Jan 2024 12:08:18 +0100 Subject: [PATCH 5/5] Remove the Java test now that we have it in the invariants --- .../org/unicode/propstest/TestProperties.java | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java b/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java index 8f8a246c7..eecdd5219 100644 --- a/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java +++ b/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java @@ -214,35 +214,6 @@ public void TestAAScripts() { } } - @Test - public void TestJoiningGroupConsistency() { - // TODO(egg): I would like to be able to put that in the invariants tests as « the partition - // defined by Joining_Group is finer than that defined by Joining_Type ». - UnicodeMap joiningGroup = iup.load(UcdProperty.Joining_Group); - UnicodeMap joiningType = iup.load(UcdProperty.Joining_Type); - var charactersByJoiningGroup = new HashMap(); - joiningGroup.addInverseTo(charactersByJoiningGroup).remove("No_Joining_Group"); - charactersByJoiningGroup.forEach( - (group, set) -> { - final int first = set.getRangeStart(0); - final String firstType = joiningType.get(first); - set.forEach( - (c) -> { - assertEquals( - "U+" - + getCodeAndName(Character.toString(first)) - + "\nand\nU+" - + getCodeAndName(c) - + "\nhave different joining types but are in the" - + " same joining group (" - + group - + ")\n", - firstType, - joiningType.get(c)); - }); - }); - } - @Test public void TestScripts() {