diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java index e64326853..0e8036638 100644 --- a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java +++ b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java @@ -231,7 +231,10 @@ public void put( if (value != null && value.isEmpty() && property != UcdProperty.NFKC_Casefold - && property != UcdProperty.NFKC_Simple_Casefold) { + && property != UcdProperty.NFKC_Simple_Casefold + && property != UcdProperty.Jamo_Short_Name) { + // TODO(egg): We probably should do this only exceptionally for UnicodeData.txt, + // instead of by default for all but the few properties above. value = null; } value = normalizeAndVerify(value); diff --git a/unicodetools/src/main/java/org/unicode/props/ShimUnicodePropertyFactory.java b/unicodetools/src/main/java/org/unicode/props/ShimUnicodePropertyFactory.java index d51ba46c9..86eb5c2db 100644 --- a/unicodetools/src/main/java/org/unicode/props/ShimUnicodePropertyFactory.java +++ b/unicodetools/src/main/java/org/unicode/props/ShimUnicodePropertyFactory.java @@ -67,6 +67,8 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) { oldValue == null ? UTF16.valueOf(cp) : oldValue); break; case "Bidi_Paired_Bracket": + // The default is in PropertyValueAliases.txt, but TUP incorrectly + // has it as U+0000. prop = replaceValues(prop, oldValue -> oldValue == null ? 
"\u0000" : oldValue); break; case "FC_NFKC_Closure": @@ -76,9 +78,6 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) { replaceCpValues( prop, (cp, oldValue) -> fixFC_NFKC_Closure(cp, oldValue)); - break; - case "Jamo_Short_Name": - prop = modifyJamo_Short_Name(prop); break; case "Name": // TUP reports the special label as the value of the Name @@ -315,11 +314,6 @@ private String fixFC_NFKC_Closure(int cp, String oldValue) { } } - // Jamo_Short_Name needs fix in IUP - private UnicodeProperty modifyJamo_Short_Name(UnicodeProperty prop) { - return copyPropReplacingMap(prop, prop.getUnicodeMap().put('ᄋ', "")); - } - /** Very useful. May already be in ICU, but not sure. */ public boolean equalsString(int codepoint, String value) { return codepoint == value.codePointAt(0) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 434f000d5..a30dab80d 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -17,10 +17,13 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; -import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.TreeMap; +import java.util.function.Function; +import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Tabber; import org.unicode.cldr.util.Tabber.HTMLTabber; @@ -141,6 +144,7 @@ private static BufferedReader getInputReader(String inputFile) throws IOExceptio * @throws IOException */ public static int testInvariants(String inputFile, boolean doRange) throws IOException { + TestUnicodeInvariants.doRange = doRange; parseErrorCount = 0; testFailureCount = 0; boolean showScript = false; @@ -238,7 +242,7 @@ public static int 
testInvariants(String inputFile, boolean doRange) throws IOExc showMapLine(line, pp); } else if (line.startsWith("Show")) { showLine(line, pp); - } else if (line.startsWith("EquivalencesOf")) { + } else if (line.startsWith("OnPairsOf")) { equivalencesLine(line, pp, lineNumber); } else { testLine(line, pp, lineNumber); @@ -275,12 +279,16 @@ static class PropertyComparison { private static void equivalencesLine(String line, ParsePosition pp, int lineNumber) throws ParseException { - pp.setIndex("EquivalencesOf".length()); + pp.setIndex("OnPairsOf".length()); final UnicodeSet domain = new UnicodeSet(line, pp, symbolTable); + expectToken(",", pp, line); + expectToken("EqualityOf", pp, line); final var leftProperty = CompoundProperty.of(LATEST_PROPS, line, pp); scan(PATTERN_WHITE_SPACE, line, pp, true); char relationOperator = line.charAt(pp.getIndex()); pp.setIndex(pp.getIndex() + 1); + scan(PATTERN_WHITE_SPACE, line, pp, true); + expectToken("EqualityOf", pp, line); final var rightProperty = CompoundProperty.of(LATEST_PROPS, line, pp); boolean leftShouldImplyRight = false; @@ -515,11 +523,22 @@ private static void inLine(ParsePosition pp, String line, int lineNumber) } } + private static void expectToken(String token, ParsePosition pp, String line) + throws ParseException { + scan(PATTERN_WHITE_SPACE, line, pp, true); + if (!line.substring(pp.getIndex()).startsWith(token)) { + throw new ParseException("Expected " + token, pp.getIndex()); + } + pp.setIndex(pp.getIndex() + token.length()); + scan(PATTERN_WHITE_SPACE, line, pp, true); + } + private static PropertyComparison getPropertyComparison(ParsePosition pp, String line) throws ParseException { final PropertyComparison propertyComparison = new PropertyComparison(); propertyComparison.valueSet = new UnicodeSet(line, pp, symbolTable); + expectToken(",", pp, line); propertyComparison.property1 = CompoundProperty.of(LATEST_PROPS, line, pp); final int cp = line.codePointAt(pp.getIndex()); if (cp != '=' && cp != '≠') { @@ 
-549,12 +568,15 @@ static class FilterOrProp { enum Type { filter, prop, - stringprop + stringprop, + sequenceTransformation, }; private Type type; private UnicodeProperty prop; private UnicodeSet filter; + private Function, List> sequenceTransformation; + private Function, String> sequenceReduction; } private static final UnicodeSet PROPCHARS = @@ -571,6 +593,86 @@ static UnicodeProperty of( propOrFilter.filter = parseUnicodeSet(line, pp); propOrFilter.type = FilterOrProp.Type.filter; result.propOrFilters.add(propOrFilter); + } else if (line.charAt(pp.getIndex()) == '(') { + final FilterOrProp propOrFilter = new FilterOrProp(); + final var matcher = + Pattern.compile("(\\( *([^ )]+)(?: +([^)]+))? *\\)).*") + .matcher(line.substring(pp.getIndex())); + if (!matcher.matches()) { + throw new IllegalArgumentException( + "Expected ( ), got " + + line.substring(pp.getIndex())); + } + propOrFilter.type = FilterOrProp.Type.sequenceTransformation; + final String expression = matcher.group(1); + final String operation = matcher.group(2); + final String args = matcher.group(3); + switch (operation) { + case "take": + { + final int count = Integer.parseInt(args); + propOrFilter.sequenceTransformation = s -> s.subList(0, count); + break; + } + case "drop": + { + final int count = Integer.parseInt(args); + propOrFilter.sequenceTransformation = + s -> s.subList(count, s.size()); + break; + } + case "delete-adjacent-duplicates": + { + propOrFilter.sequenceTransformation = + s -> { + if (s.isEmpty()) { + return s; + } + int j = 0; + for (int i = 1; i < s.size(); ++i) { + if (!Objects.equals(s.get(i), s.get(j))) { + s.set(++j, s.get(i)); + } + } + s.subList(j + 1, s.size()).clear(); + return s; + }; + break; + } + case "prepend": + { + propOrFilter.sequenceTransformation = + s -> { + s.add(0, args); + return s; + }; + break; + } + case "append": + { + propOrFilter.sequenceTransformation = + s -> { + s.add(args); + return s; + }; + break; + } + case "string-join": + { + 
propOrFilter.sequenceReduction = s -> String.join("", s); + break; + } + case "constant": + { + propOrFilter.sequenceReduction = s -> args; + break; + } + default: + throw new IllegalArgumentException( + "Unknown operation " + matcher.group(1)); + } + result.propOrFilters.add(propOrFilter); + pp.setIndex(pp.getIndex() + expression.length()); } else { final String propName = scan(PROPCHARS, line, pp, true); if (propName.length() > 0) { @@ -583,9 +685,11 @@ static UnicodeProperty of( "Can't create property for: " + propName); } propOrFilter.type = - propOrFilter.prop.getType() != UnicodeProperty.STRING - ? FilterOrProp.Type.prop - : FilterOrProp.Type.stringprop; + propOrFilter.prop.getType() == UnicodeProperty.STRING + || propOrFilter.prop.getType() + == UnicodeProperty.EXTENDED_STRING + ? FilterOrProp.Type.stringprop + : FilterOrProp.Type.prop; result.propOrFilters.add(propOrFilter); } else { break; @@ -629,13 +733,21 @@ protected List _getNameAliases(List result) { @Override protected String _getValue(int codepoint) { final StringBuffer buffer = new StringBuffer(); - String value = UTF16.valueOf(codepoint); + String value = Character.toString(codepoint); + List values = null; int cp; for (int i = propOrFilters.size() - 1; i >= 0; --i) { final FilterOrProp propOrFilter = propOrFilters.get(i); switch (propOrFilter.type) { case filter: + if (value == null) { + throw new IllegalArgumentException( + "Cannot apply filter " + + propOrFilter.filter.toString() + + " to sequence " + + values); + } buffer.setLength(0); for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) { cp = UTF16.charAt(value, j); @@ -647,6 +759,13 @@ protected String _getValue(int codepoint) { value = buffer.toString(); break; case stringprop: + if (value == null) { + throw new IllegalArgumentException( + "Cannot apply string property " + + propOrFilter.prop.getName() + + " to sequence " + + values); + } buffer.setLength(0); for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) { 
cp = UTF16.charAt(value, j); @@ -656,19 +775,53 @@ protected String _getValue(int codepoint) { value = buffer.toString(); break; case prop: - final LinkedHashSet values = new LinkedHashSet(); + if (value == null) { + throw new IllegalArgumentException( + "Cannot apply enumerated property " + + propOrFilter.prop.getName() + + " to sequence " + + values); + } + values = new ArrayList<>(); for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) { cp = UTF16.charAt(value, j); final String value2 = propOrFilter.prop.getValue(cp); values.add(value2); } - if (values.size() == 0) { - value = ""; - } else if (values.size() == 1) { - value = values.iterator().next(); + value = null; + break; + case sequenceTransformation: + final boolean wasString = value != null; + if (wasString) { + values = + value.codePoints() + .mapToObj(Character::toString) + .collect( + Collectors.toCollection( + () -> new ArrayList<>())); + value = null; + } + if (propOrFilter.sequenceTransformation != null) { + values = propOrFilter.sequenceTransformation.apply(values); + if (wasString) { + value = String.join("", values); + values = null; + } } else { - value = values.toString(); + value = propOrFilter.sequenceReduction.apply(values); + values = null; } + break; + } + } + if (value == null) { + if (values.isEmpty()) { + return ""; + } else if (values.size() == 1) { + return values.get(0); + } else { + throw new IllegalArgumentException( + "Compound property must return a string, not sequence " + values); } } return value; diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 6a19a4f25..57cce5c62 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -49,32 +49,45 @@ # # If there is an error in the test, a comparison listing of the two sides of the 
relation is generated. ########################## -# In (=|≠) +# In , (=|≠) # # For each character in , verify that the result of applying the left # is (=|≠) the result of applying the right . -# is of the form ( | ) ("*" ( | ))? +# is of the form ( | ) ("*" ( | | ))? # It is the functional composition of the properties applied to strings, whereby # is used to filter the result. # for a string property is applied to each character, and the result concatenated # That is, cf("A1") is cf("A")+cf("1") = "a1" -# for an enumerated property, is applied to each character, and the result is a concatenated set. -# That is, gc("A1") is gc("A")+gc("1") = "Uppercase_LetterDecimal_Number" +# for an enumerated property is applied to each character, and the result is a sequence of +# strings. +# That is, gc("A1") is [gc("A"), gc("1")] = ["Uppercase_Letter", "Decimal_Number"] +# may be applied to a sequence of strings or to a string. On a string it +# operates on the code points and returns a string. +# The available operations are: +# (append ) +# (prepend ) +# (take ) +# (drop ) +# (delete-adjacent-duplicates) +# (string-join) +# (constant ) +# The result of the must be a string, a single-string sequence, or an empty sequence. # # Example: for of bc * \P{bc=NSM} * cf * dm, the result applied to Å (angstrom sign) are: # bc * \P{bc=NSM} * cf * dm ("Å") # bc * \P{bc=NSM} * cf ("A" + umlaut) # bc * \P{bc=NSM} ("a" + umlaut) # bc ("a") -# "Left" +# "Left_To_Right" # -# Example: In \p{dt=canonical} bc * \P{bc=NSM} * dm = bc * \P{bc=NSM} +# Example: In \p{dt=canonical}, (delete-adjacent-duplicates) * bc * \P{bc=NSM} * dm = bc * \P{bc=NSM} # This examines only those characters that have canonical compositions. For each such character X -# it gets the decomposition mapping of X, then filters out all NSM characters, then gets the Bidi_Class. -# It then tests that against the result of filtering out NSM characters from X, then getting the BIDI_Class. 
+# it gets the decomposition mapping of X, then filters out all NSM characters, then gets the Bidi_Class, +# then deduplicates runs of the same Bidi_Class. +# It then compares that with the result of filtering out NSM characters from X, then getting the Bidi_Class. # ########################## -# EquivalencesOf (⇐|⇔|⇒|⇍|⇎|⇏) +# OnPairsOf , EqualityOf (⇐|⇔|⇒|⇍|⇎|⇏) EqualityOf # # Verify that the equivalence of elements of up to the left is is implied, # equivalent, or implies equivalence up to the right , or verify the negation of these @@ -85,16 +98,16 @@ # Example: # The case-insensitive comparison of ASCII identifiers defined by comparing their uppercase # mappings is equivalent to that defined by comparing their lowercase mappings: -# EquivalencesOf \p{Block=Basic Latin} Uppercase_Mapping ⇔ Lowercase_Mapping +# OnPairsOf \p{Block=Basic Latin}, EqualityOf Uppercase_Mapping ⇔ EqualityOf Lowercase_Mapping # This is not true in the broader Latin script (in fact neither implication holds). -# EquivalencesOf \p{Script=Latin} Uppercase_Mapping ⇎ Lowercase_Mapping +# OnPairsOf \p{Script=Latin}, EqualityOf Uppercase_Mapping ⇎ EqualityOf Lowercase_Mapping # The simple and full case foldings do not define the same equivalence classes on these # strings: # Let $strings = [ {Straße} {STRASSE} {ᾠδή} {ὨΙΔΉ} {...} ] -# EquivalencesOf $strings Case_Folding ⇎ Simple_Case_Folding +# OnPairsOf $strings, EqualityOf Case_Folding ⇎ EqualityOf Simple_Case_Folding # Specifically, full case folding is coarser than simple case folding. 
-# EquivalencesOf $strings Case_Folding ⇏ Simple_Case_Folding -# EquivalencesOf $strings Case_Folding ⇐ Simple_Case_Folding +# OnPairsOf $strings, EqualityOf Case_Folding ⇏ EqualityOf Simple_Case_Folding +# OnPairsOf $strings, EqualityOf Case_Folding ⇐ EqualityOf Simple_Case_Folding ########################## # There is new syntax for testing UnicodeMaps # @@ -163,7 +176,7 @@ $gcAllLetters = [\p{gc=Ll}\p{gc=Lu}\p{gc=Lo}\p{gc=Lt}\p{gc=Lm}] \p{GC=unassigned} ⊆ \p{U-1:GC=unassigned} # Name Stability: The Unicode Name property value for any non-reserved code point will not be changed. In particular, once a character is encoded, its name will not be changed. -In \P{U-1:GC=Cn} name=U-1:name +In \P{U-1:GC=Cn}, name=U-1:name # Formal Name Alias Stability # TODO @@ -194,19 +207,19 @@ In \P{U-1:GC=Cn} name=U-1:name # Case Folding Stability: Caseless matching of Unicode strings used for identifiers is stable. # For each string S containing only assigned characters in a given Unicode version, toCasefold(toNFKC(S)) under that version is identical to toCasefold(toNFKC(S)) under any later version of Unicode. -In \P{U-1:gc=Cn} U-1:Case_Folding * U-1:toNFKC = Case_Folding * toNFKC +In \P{U-1:gc=Cn}, U-1:Case_Folding * U-1:toNFKC = Case_Folding * toNFKC # For each string S containing only characters with the property XID_Continue in a given Unicode version, toNFKC_Casefold(S) under that version is identical to toNFKC_Casefold(S) under any later version of Unicode. # If this is violated as a result of a change to Default_Ignorable_Code_Point, the derivation of NFKC_Casefold must special-case the affected characters to restore stability. 
-In \p{U-1:XID_Continue} U-1:NFKC_Casefold = NFKC_Casefold +In \p{U-1:XID_Continue}, U-1:NFKC_Casefold = NFKC_Casefold # The following is not guaranteed by the stability policy, indeed it does not hold as far back as 5.2, but we might want to know about it: -In \P{U-1:gc=Cn} U-1:NFKC_Casefold = NFKC_Casefold +In \P{U-1:gc=Cn}, U-1:NFKC_Casefold = NFKC_Casefold # Not yet a stability policy, but see https://www.unicode.org/L2/L2023/23005.htm#174-A11. # Simple counterparts of the above. -In \P{U-1:gc=Cn} U-1:Simple_Case_Folding * U-1:toNFKC = Simple_Case_Folding * toNFKC -In \p{U-1:XID_Continue} R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold +In \P{U-1:gc=Cn}, U-1:Simple_Case_Folding * U-1:toNFKC = Simple_Case_Folding * toNFKC +In \p{U-1:XID_Continue}, R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold # As above, this one would not be guaranteed by the stability policy. -In \P{U-1:gc=Cn} R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold +In \P{U-1:gc=Cn}, R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold # Case Pair Stability: If two characters form a case pair in a version of Unicode, they will remain a case pair in each subsequent version of Unicode. If two characters do not form a case pair in a version of Unicode, they will never become a case pair in any subsequent version of Unicode. # TODO @@ -241,13 +254,13 @@ Let $caseOverlap = [\u02B0-\u02B8\u02C0\u02C1\u02E0-\u02E4\u0345\u037A\u10FC\u1D # Stability: The property values for the bidirectional properties Bidi_Class and Bidi_Mirrored preserve canonical equivalence. # This test utilizes the fact that bc=NSM inherit behavior in the algorithm, so these are just filtered -In \p{dt=canonical} bc * \P{bc=NSM} * dm = bc * \P{bc=NSM} +In \p{dt=canonical}, (delete-adjacent-duplicates) * bc * \P{bc=NSM} * dm = bc * \P{bc=NSM} # Stability: The property values for the bidirectional properties Bidi_Class and Bidi_Mirrored preserve canonical equivalence. 
# This test utilizes the fact that bc=NSM inherit behavior in the algorithm, so these are just filtered # There are 6 special cases, all symmetric symbols—which are not mirrored—with a solidus overlay: Let $BMExclusions = [ ≠ ∤ ∦ ≢ ≭ ⫝̸ ] -In [\p{dt=canonical}-$BMExclusions] Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM} +In [\p{dt=canonical}-$BMExclusions], (delete-adjacent-duplicates) * Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM} # Additional BIDI invariant constants Let $AL_blocks = [\u0600-\u07BF \u0860-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF \uFE70-\uFEFF \U00010D00-\U00010D3F \U00010EC0-\U00010EFF \U00010F30-\U00010F6F \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF] @@ -289,13 +302,13 @@ Let $gcMn_bcL = [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F] \p{cf=/.../} ⊃ [] # Case folding is not the same as lowercasing: Cherokee case folds to uppercase. -In \p{sc=Cher} cf = uc -In \p{sc=Cher} scf = suc +In \p{sc=Cher}, cf = uc +In \p{sc=Cher}, scf = suc # Simple and full case foldings define the same equivalence classes on code points. # This used not to be true, but was rectified by # https://www.unicode.org/cgi-bin/GetL2Ref.pl?175-C19. -EquivalencesOf $codepoints Case_Folding ⇔ Simple_Case_Folding +OnPairsOf $codepoints, EqualityOf Case_Folding ⇔ EqualityOf Simple_Case_Folding # Stability: All characters with the Lowercase property and all characters with the Uppercase property have the Alphabetic property. \p{Alphabetic} ⊃ [\p{Uppercase} \p{Lowercase}] @@ -321,7 +334,7 @@ EquivalencesOf $codepoints Case_Folding ⇔ Simple_Case_Folding # Stability: Once a character is assigned, both its Name and its Jamo_Short_Name will never change. # Name is covered in Main policies -# TODO: Short Name +In \P{U-1:GC=Cn}, Jamo_Short_Name=U-1:Jamo_Short_Name # Stability: The Noncharacter_Code_Point property is an immutable code point property, which means that its property values for all Unicode code points will never change. 
\p{NChar} = \p{U-1:NChar} @@ -402,7 +415,7 @@ Let $identifier_extend = [\p{GC=Mn}\p{GC=Mc}\p{GC=Nd}\p{GC=Pc}] \p{CCC=/^([0-9][0-9]?|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$/} = [\u0000-\U0010FFFF] # Stability: Once a character is assigned, its Canonical_Combining_Class will never change. -In \P{U-1:GC=Cn} ccc=U-1:ccc +In \P{U-1:GC=Cn}, ccc=U-1:ccc # Canonical decompositions (minus exclusions) must be identical across releases (also required by strong normalization stability), # except where a character and at least one character in its decomposition are both new in the release. @@ -410,23 +423,23 @@ Let $New_Decompositions = [[\p{Decomposition_Type=Canonical} - \p{Full_Compositi $New_Decompositions ⊆ \p{U-1:GC=Cn} # Stripping previously-unassigned characters from the current NFD does # something, that is, the decomposition contains newly-assigned characters. -In $New_Decompositions toNFD * \P{U-1:GC=Cn} ≠ toNFD +In $New_Decompositions, toNFD * \P{U-1:GC=Cn} ≠ toNFD Let $Unicode_13_Decompositions = [[\p{U13.0.0:Decomposition_Type=Canonical} - \p{U13.0.0:Full_Composition_Exclusion}] - [\p{U12.1.0:Decomposition_Type=Canonical} - \p{U12.1.0:Full_Composition_Exclusion}]] $Unicode_13_Decompositions ⊆ \p{U12.1.0:GC=Cn} -In $Unicode_13_Decompositions toNFD * \P{U12.1.0:GC=Cn} ≠ toNFD +In $Unicode_13_Decompositions, toNFD * \P{U12.1.0:GC=Cn} ≠ toNFD $Unicode_13_Decompositions = [\U00011938] $Unicode_13_Decompositions = [\p{Name=DIVES AKURU VOWEL SIGN O}] Let $Unicode_7_Decompositions = [[\p{U7.0.0:Decomposition_Type=Canonical} - \p{U7.0.0:Full_Composition_Exclusion}] - [\p{U6.3.0:Decomposition_Type=Canonical} - \p{U6.3.0:Full_Composition_Exclusion}]] $Unicode_7_Decompositions ⊆ \p{U6.3.0:GC=Cn} -In $Unicode_7_Decompositions toNFD * \P{U6.3.0:GC=Cn} ≠ toNFD +In $Unicode_7_Decompositions, toNFD * \P{U6.3.0:GC=Cn} ≠ toNFD $Unicode_7_Decompositions = [\U0001134B-\U0001134C \U000114BB-\U000114BC \U000114BE \U000115BA-\U000115BB] $Unicode_7_Decompositions ⊆ 
[\p{Name=/^(GRANTHA|TIRHUTA|SIDDHAM) VOWEL SIGN /}] Let $Unicode_6_1_Decompositions = [[\p{U6.1.0:Decomposition_Type=Canonical} - \p{U6.1.0:Full_Composition_Exclusion}] - [\p{U6.0.0:Decomposition_Type=Canonical} - \p{U6.0.0:Full_Composition_Exclusion}]] $Unicode_6_1_Decompositions ⊆ \p{U6.0.0:GC=Cn} -In $Unicode_6_1_Decompositions toNFD * \P{U6.0.0:GC=Cn} ≠ toNFD +In $Unicode_6_1_Decompositions, toNFD * \P{U6.0.0:GC=Cn} ≠ toNFD $Unicode_6_1_Decompositions = [\U0001112E-\U0001112F] $Unicode_6_1_Decompositions ⊆ [\p{Name=/^CHAKMA VOWEL SIGN /}] @@ -435,7 +448,7 @@ $Unicode_6_1_Decompositions ⊆ [\p{Name=/^CHAKMA VOWEL SIGN /}] # Stability: Canonical and compatibility mappings (Decomposition_Mapping property values) are always in canonical order, and the resulting recursive decomposition will also be in canonical order. # Note: We really mean the fixed point of Decomposition_Mapping on the left-hand side here. -In \P{Decomposition_Mapping=} Decomposition_Mapping * Decomposition_Mapping * Decomposition_Mapping = toNFKD +In \P{Decomposition_Mapping=}, Decomposition_Mapping * Decomposition_Mapping * Decomposition_Mapping = toNFKD # Stability: Canonical mappings (Decomposition_Mapping property values) are always limited either to a single value or to a pair. The second character in the pair cannot itself have a canonical mapping. [ \p{Decomposition_Type=Canonical} & \p{Decomposition_Mapping=/.../} ] = [] @@ -447,7 +460,7 @@ In \P{Decomposition_Mapping=} Decomposition_Mapping * Decomposition_Mapping * De \p{toNFC=/.../} ⊃ [] # Stability: Once a character is assigned, its Decomposition_Mapping will never change. 
-In \P{U-1:GC=Cn} dm=U-1:dm +In \P{U-1:GC=Cn}, dm=U-1:dm # U6.0: Construction of Full_Composition_Exclusion # Primary Composites don't include singletons, ccc!=0, or sequences starting with ccc!=0 @@ -473,7 +486,7 @@ $combiningExclusions ⊇ [$firstNonStarter & \p{dt=canonical}] [\p{General_Category=Decimal_Number}\p{General_Category=Letter_Number}\p{General_Category=Other_Number}] ∥ \p{Numeric_Type=None} -# 133-C25] The distinction between numeric type = digit and and numeric type = numeric is not useful. +# [133-C25] The distinction between numeric type = digit and numeric type = numeric is not useful. # No new characters will be assigned the numeric type "digit" \p{Numeric_Type=Digit} = \p{U-1:Numeric_Type=Digit} @@ -556,12 +569,12 @@ Let $nonAlphabeticAvagrahas = [\N{TIBETAN MARK PALUTA}] # A punctuation mark. ## Joining_Type and Joining_Group # Where defined, the Joining_Group refines the Joining_Type. -EquivalencesOf \P{Joining_Group=No_Joining_Group} Joining_Group ⇒ Joining_Type +OnPairsOf \P{Joining_Group=No_Joining_Group}, EqualityOf Joining_Group ⇒ EqualityOf Joining_Type \p{gc=Mn} ⊆ \p{Joining_Type=Transparent} \p{gc=Me} ⊆ \p{Joining_Type=Transparent} # Derivation of Joining_Type from the second column of ArabicShaping.txt (unofficially Other_Joining_Type). 
-In \P{Other_Joining_Type=Deduce_From_General_Category} Joining_Type = Other_Joining_Type +In \P{Other_Joining_Type=Deduce_From_General_Category}, Joining_Type = Other_Joining_Type [ \p{Other_Joining_Type=Deduce_From_General_Category} & [\p{gc=Mn}\p{gc=Me}\p{gc=Cf}] ] ⊆ \p{Joining_Type=Transparent} [ \p{Other_Joining_Type=Deduce_From_General_Category} - [\p{gc=Mn}\p{gc=Me}\p{gc=Cf}] ] ⊆ \p{Joining_Type=Non_Joining} @@ -965,9 +978,7 @@ Let $nonIdeographicStrokes = \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZ \P{Equivalent_Unified_Ideograph=@none@} ⊆ $strokesAndRadicals [$strokesAndRadicals - \P{Equivalent_Unified_Ideograph=@none@}] = [$nonIdeographicStrokes $nonIdeographicRadicals] -# TODO(egg): NFC_Quick_Check is a stupid way to get a Yes here; we are checking -# that Equivalent_Unified_Ideograph values are single unified ideographs. -In \P{Equivalent_Unified_Ideograph=@none@} Unified_Ideograph * Equivalent_Unified_Ideograph = NFC_Quick_Check +In \P{Equivalent_Unified_Ideograph=@none@}, Unified_Ideograph * Equivalent_Unified_Ideograph = (constant Yes) # InPC-InSC-gc invariants # See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf. @@ -979,4 +990,8 @@ In \P{Equivalent_Unified_Ideograph=@none@} Unified_Ideograph * Equivalent_Unifie # Script Extensions (mostly testing the proper handling of multivalued properties). \p{sc=Deva} ⊂ \p{scx=Deva} -[\p{scx=Deva} & \p{scx=Beng}] ⊃ [] \ No newline at end of file +[\p{scx=Deva} & \p{scx=Beng}] ⊃ [] + +# Hangul Syllable Name Generation, +# https://www.unicode.org/versions/latest/ch03.pdf#G59675. +In [\p{Block=Hangul Syllables} - \p{gc=Cn}], (prepend HANGUL SYLLABLE ) * (string-join) * Jamo_Short_Name * toNFD = Name \ No newline at end of file