From f04a3ad64643b9ffa687b2f3a6cac7429b1782f8 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Mar 2024 16:41:15 +0100 Subject: [PATCH 1/6] An invariant about Lowercase --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 69e23177d..3dcce4e73 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -586,6 +586,8 @@ Let $nonAlphabeticAvagrahas = [\N{TIBETAN MARK PALUTA}] # A punctuation mark. [\p{InSC=Avagraha} - $nonAlphabeticAvagrahas] ⊆ \p{Alphabetic} # Name-based checks. +Let $nonLowercaseSmallLetters = [ \p{name=/^LIMBU SMALL LETTER/} \N{TURNED GREEK SMALL LETTER IOTA} \p{name=/^(SQUARED|PARENTHESIZED|TAG) LATIN SMALL LETTER/} ] +[ \p{name=/\bSMALL LETTER\b}-\p{gc=Mn}-\p{gc=Lt} - $nonLowercaseSmallLetters ] ⊆ \p{Lowercase} # Combining letters are often alphabetic (medievalist abbreviations). # The others are diacritic (cantillation marks, phonetics). From 22b578c300c1903c445d4b6b323800362007f5b0 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Mar 2024 17:51:46 +0100 Subject: [PATCH 2/6] Test the modifier letters too --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 3dcce4e73..e8e82ad78 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -587,7 +587,9 @@ Let $nonAlphabeticAvagrahas = [\N{TIBETAN MARK PALUTA}] # A punctuation mark. 
# Name-based checks. Let $nonLowercaseSmallLetters = [ \p{name=/^LIMBU SMALL LETTER/} \N{TURNED GREEK SMALL LETTER IOTA} \p{name=/^(SQUARED|PARENTHESIZED|TAG) LATIN SMALL LETTER/} ] -[ \p{name=/\bSMALL LETTER\b}-\p{gc=Mn}-\p{gc=Lt} - $nonLowercaseSmallLetters ] ⊆ \p{Lowercase} +Let $nonLowercaseSmallModifierLetters = [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] +[ \p{name=/\bSMALL LETTER\b/}-\p{gc=Mn}-\p{gc=Lt} - $nonLowercaseSmallLetters ] ⊆ \p{Lowercase} +[ [\p{gc=Lm} & \p{name=/SMALL/}] - $nonLowercaseSmallModifierLetters ] ⊆ \p{Lowercase} # Combining letters are often alphabetic (medievalist abbreviations). # The others are diacritic (cantillation marks, phonetics). From 438ecd0406960367e06ac90b76d138612ee62131 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Mar 2024 18:13:56 +0100 Subject: [PATCH 3/6] Use the decompositions --- .../org/unicode/text/UCD/UnicodeInvariantTest.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index e8e82ad78..4fa139b32 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -596,6 +596,14 @@ Let $nonLowercaseSmallModifierLetters = [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] # See 177-C52. \p{name=/COMBINING .* LETTER/} ⊆ [\p{Alphabetic}\p{Diacritic}] +## Consistency of Lowercase with decompositions. +# Note that the same is not true of Uppercase. +# A non-lowercase character has non-lowercase characters in its decomposition, +# or it its decomposition is (㋍ etc.). +In [\P{Lowercase} - \p{dt=square}], \p{Lowercase} * toNFKD ≠ toNFKD +# A lowercase character decomposes to lowercase characters and nonspacing marks. 
+In \p{Lowercase}, [\p{Lowercase}\p{Mn}] * toNFKD = toNFKD + ## Joining_Type and Joining_Group # Where defined, the Joining_Group refines the Joining_Type. OnPairsOf \P{Joining_Group=No_Joining_Group}, EqualityOf Joining_Group ⇒ EqualityOf Joining_Type From 1e1d40ede3ff9ef843bfa310ab81dfafb559686a Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Mar 2024 18:22:45 +0100 Subject: [PATCH 4/6] bad parser --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 4fa139b32..f5377a383 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -602,7 +602,7 @@ Let $nonLowercaseSmallModifierLetters = [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] # or it its decomposition is (㋍ etc.). In [\P{Lowercase} - \p{dt=square}], \p{Lowercase} * toNFKD ≠ toNFKD # A lowercase character decomposes to lowercase characters and nonspacing marks. -In \p{Lowercase}, [\p{Lowercase}\p{Mn}] * toNFKD = toNFKD +In \p{Lowercase}, [\p{Lowercase}\p{gc=Mn}] * toNFKD = toNFKD ## Joining_Type and Joining_Group # Where defined, the Joining_Group refines the Joining_Type. From 259c90f1f18adc04c254d2af5225f5a852fb287f Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Mar 2024 18:30:46 +0100 Subject: [PATCH 5/6] that is just wrong. 
--- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index f5377a383..c3a93fa5b 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -601,8 +601,6 @@ Let $nonLowercaseSmallModifierLetters = [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] # A non-lowercase character has non-lowercase characters in its decomposition, # or it its decomposition is (㋍ etc.). In [\P{Lowercase} - \p{dt=square}], \p{Lowercase} * toNFKD ≠ toNFKD -# A lowercase character decomposes to lowercase characters and nonspacing marks. -In \p{Lowercase}, [\p{Lowercase}\p{gc=Mn}] * toNFKD = toNFKD ## Joining_Type and Joining_Group # Where defined, the Joining_Group refines the Joining_Type. From 2846cee80c88a6a43c18f991afd0fa3bfecf01c0 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Mar 2024 20:27:21 +0100 Subject: [PATCH 6/6] Tippfehler Co-authored-by: Markus Scherer --- .../resources/org/unicode/text/UCD/UnicodeInvariantTest.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index c3a93fa5b..485a35d5d 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -599,7 +599,7 @@ Let $nonLowercaseSmallModifierLetters = [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] ## Consistency of Lowercase with decompositions. # Note that the same is not true of Uppercase. # A non-lowercase character has non-lowercase characters in its decomposition, -# or it its decomposition is (㋍ etc.). 
+# or its decomposition type is square (㋍ etc.). In [\P{Lowercase} - \p{dt=square}], \p{Lowercase} * toNFKD ≠ toNFKD ## Joining_Type and Joining_Group