From 9fdc70ef08ad2fd19e483bd005fd90e0ffa863bd Mon Sep 17 00:00:00 2001 From: Roozbeh Pournader Date: Fri, 24 May 2024 16:19:56 -0700 Subject: [PATCH 01/10] Fix typo in DoNotEmit.txt comments (#837) The file header had a misspelling: Egyptial instead of Egyptian. unicode-org/properties#295 (comment) --- unicodetools/data/ucd/dev/DoNotEmit.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unicodetools/data/ucd/dev/DoNotEmit.txt b/unicodetools/data/ucd/dev/DoNotEmit.txt index db6ac621b..43f4eaa09 100644 --- a/unicodetools/data/ucd/dev/DoNotEmit.txt +++ b/unicodetools/data/ucd/dev/DoNotEmit.txt @@ -1,5 +1,5 @@ # DoNotEmit-16.0.0.txt -# Date: 2024-03-18, 09:28:00 GMT +# Date: 2024-05-24, 11:01:00 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -83,7 +83,7 @@ # Hamza_Form: # Sequences containing Arabic hamza above, which should be avoided. # Precomposed_Hieroglyph: -# Precomposed sequences for Egyptial Hieroglyphs which should be avoided. +# Precomposed sequences for Egyptian Hieroglyphs which should be avoided. # Precomposed_Form: # Sequences for which a precomposed form exists, but without canonical # equivalence. From 32c5ef9608b8b940428ec0d9aa3c6b11f16473a3 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 31 May 2024 17:20:24 +0200 Subject: [PATCH 02/10] Fix some bugs in the splitting of the invariants (#840) * null * blarg * blaaargh * Put the . 
in the extension Co-authored-by: Markus Scherer * suffix for testInvariants * docs --------- Co-authored-by: Markus Scherer --- docs/build.md | 4 +++- .../text/UCD/TestUnicodeInvariants.java | 22 ++++++++++--------- .../text/UCD/TestTestUnicodeInvariants.java | 5 +++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/build.md b/docs/build.md index 4f5b25173..cd7a62971 100644 --- a/docs/build.md +++ b/docs/build.md @@ -550,7 +550,7 @@ We no longer post files to FTP folders, nor publish individual files without con * org.unicode.text.UCD * TestUnicodeInvariants.java 1. Run>Run As... Java Application\ - Will create the following file of results: + Will create the following files of results: ``` {Generated}/UnicodeTestResults.txt ``` @@ -623,6 +623,8 @@ We no longer post files to FTP folders, nor publish individual files without con and what are likely remedies (changing properties, adding to an exceptions list, changing the test case). Improve these comments as needed. +1. Additional tests for UTS #39 data are found in [unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt](https://github.com/unicode-org/unicodetools/blob/main/unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt). + 1. These are reported in `{Generated}/UnicodeTestResults-security.txt` when running `TestTestUnicodeInvariants`. 
### Options diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index d122441e5..6bfdc0edf 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -77,7 +77,7 @@ public static void main(String[] args) throws IOException { System.out.println("HTML?\t" + doHtml); - testInvariants(file, doRange); + testInvariants(file, null, doRange); } static Transliterator toHTML; @@ -124,26 +124,26 @@ enum Expected { /** * Fetch a reader for our input data. * - * @param inputFile if null, read DEFAULT_FILE from classpath + * @param inputFile read from classpath * @return BufferedReader * @throws IOException */ private static BufferedReader getInputReader(String inputFile) throws IOException { - if (inputFile != null) { - return FileUtilities.openUTF8Reader(Settings.SRC_UCD_DIR, inputFile); - } - - // null: read it from resource data - return FileUtilities.openFile(TestUnicodeInvariants.class, DEFAULT_FILE); + return FileUtilities.openFile(TestUnicodeInvariants.class, inputFile); } /** * @param inputFile file to input, defaults to DEFAULT_FILE + * @param suffix Suffix for the test results report file, added after a hyphen if non-null. 
* @param doRange normally true * @return number of failures (0 is better) * @throws IOException */ - public static int testInvariants(String inputFile, boolean doRange) throws IOException { + public static int testInvariants(String inputFile, String suffix, boolean doRange) + throws IOException { + if (inputFile == null) { + inputFile = DEFAULT_FILE; + } TestUnicodeInvariants.doRange = doRange; parseErrorCount = 0; testFailureCount = 0; @@ -151,7 +151,9 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc try (final PrintWriter out2 = FileUtilities.openUTF8Writer( Settings.Output.GEN_DIR, - "UnicodeTestResults." + (doHtml ? "html" : "txt"))) { + "UnicodeTestResults" + + (suffix == null ? "" : "-" + suffix) + + (doHtml ? ".html" : ".txt"))) { final StringWriter writer = new StringWriter(); try (PrintWriter out3 = new PrintWriter(writer)) { out = out3; diff --git a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java index f74750a59..f8a578325 100644 --- a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java +++ b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java @@ -31,13 +31,14 @@ void testSRC_UCD_DIR() { @Test void testUnicodeInvariants() throws IOException { - int rc = TestUnicodeInvariants.testInvariants(null, true); + int rc = TestUnicodeInvariants.testInvariants(null, null, true); assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(default) failed"); } @Test void testSecurityInvariants() throws IOException { - int rc = TestUnicodeInvariants.testInvariants("SecurityInvariantTest.txt", true); + int rc = + TestUnicodeInvariants.testInvariants("SecurityInvariantTest.txt", "security", true); assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(security) failed"); } } From 263c3aac25a4221dd3bf7fc85ff142c1fd830f86 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 
31 May 2024 20:49:08 +0200 Subject: [PATCH 03/10] =?UTF-8?q?Propertywise=20=E2=80=A6=20AreAlike=20(#8?= =?UTF-8?q?42)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * This should not be a test but it seems useful. * null * blarg * blaaargh * Put the . in the extension Co-authored-by: Markus Scherer * suffix for testInvariants * docs * Propertywise … AreAlike in the invariant language * Revert "Hani CJK strokes (#836)" to check that the new comparison test fails This reverts commit af5b2d9377f91f04c15873c8e700097db24bd3c1. * Another reference * Remove early eggsperiment * Remove trace * Revert "Revert "Hani CJK strokes (#836)" to check that the new comparison test fails" This reverts commit 04cc2b03b9c8990fc9952c5b337218e98ee93190. * Placeholder heading * Don’t compare first with itself. Co-authored-by: Markus Scherer * no inner loop --------- Co-authored-by: Markus Scherer --- .../unicode/props/IndexUnicodeProperties.java | 3 + .../text/UCD/TestUnicodeInvariants.java | 84 +++++++++++++++++++ .../props/ExtraPropertyValueAliases.txt | 6 ++ .../unicode/text/UCD/AdditionComparisons.txt | 19 +++++ .../unicode/text/UCD/UnicodeInvariantTest.txt | 18 ++++ .../text/UCD/TestTestUnicodeInvariants.java | 8 ++ 6 files changed, 138 insertions(+) create mode 100644 unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index e0f0cf172..ad1479668 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -471,6 +471,9 @@ public synchronized UnicodeMap load(UcdProperty prop2, boolean expectCac final String fileName = fileInfo.getFileName(ucdVersion); if (FILE_CACHE) { + // TODO(egg): When using cached property data, most defaults do not get + // 
loaded in PropertyParsingInfo, as that happens in parseSourceFile. + // Only the ones from the Extra files are loaded. data0 = getCachedMap(prop2, fullFilename); if (data0 != null) { property2UnicodeMap.put(prop2, data0.freeze()); diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index 6bfdc0edf..320b7d120 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -17,9 +17,11 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.TreeMap; import java.util.function.Function; import java.util.regex.Pattern; @@ -31,6 +33,8 @@ import org.unicode.jsp.ICUPropertyFactory; import org.unicode.props.BagFormatter; import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.IndexUnicodeProperties.DefaultValueType; +import org.unicode.props.UcdProperty; import org.unicode.props.UnicodeProperty; import org.unicode.props.UnicodeProperty.Factory; import org.unicode.text.utility.Settings; @@ -234,6 +238,8 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang letLine(pp, line); } else if (line.startsWith("In")) { inLine(pp, line, inputFile, lineNumber); + } else if (line.startsWith("Propertywise")) { + propertywiseLine(pp, line, inputFile, lineNumber); } else if (line.startsWith("ShowScript")) { showScript = true; } else if (line.startsWith("HideScript")) { @@ -326,6 +332,84 @@ protected String getFailure(int codepoint) { } } + private static void propertywiseLine(ParsePosition pp, String line, String file, int lineNumber) + throws ParseException { + pp.setIndex("Propertywise".length()); + final UnicodeSet set = new UnicodeSet(line, pp, 
symbolTable); + if (set.hasStrings()) { + throw new ParseException( + "Set should contain only single code points for property comparison", + pp.getIndex()); + } + expectToken("AreAlike", pp, line); + if (pp.getIndex() < line.length()) { + expectToken(",", pp, line); + expectToken("Except", pp, line); + expectToken(":", pp, line); + } + Set excludedProperties = new HashSet<>(); + excludedProperties.add("Name"); + while (pp.getIndex() < line.length()) { + final int propertyNameStart = pp.getIndex(); + scan(PATTERN_WHITE_SPACE, line, pp, false); + excludedProperties.add(line.substring(propertyNameStart, pp.getIndex())); + scan(PATTERN_WHITE_SPACE, line, pp, true); + } + final var iup = IndexUnicodeProperties.make(Settings.latestVersion); + final List errorMessageLines = new ArrayList<>(); + for (var p : UcdProperty.values()) { + final var property = iup.getProperty(p); + if (property.getNameAliases().stream() + .anyMatch(alias -> excludedProperties.contains(alias))) { + continue; + } + final int first = set.charAt(0); + String p1 = property.getValue(first); + for (var range : set.ranges()) { + for (int c = range.codepoint; c <= range.codepointEnd; ++c) { + if (c == first) { + continue; + } + String p2 = property.getValue(c); + if (!Objects.equals(p1, p2)) { + if (IndexUnicodeProperties.getResolvedDefaultValueType(p) + != DefaultValueType.CODE_POINT + || !p1.equals(Character.toString(first)) + || !p2.equals(Character.toString(c))) { + errorMessageLines.add( + property.getName() + + "(" + + Character.toString(first) + + ")\t=\t" + + p1 + + "\t≠\t" + + p2 + + "\t=\t" + + property.getName() + + "(" + + Character.toString(c) + + ")"); + } + } + } + } + } + if (!errorMessageLines.isEmpty()) { + testFailureCount++; + printErrorLine("Test Failure", Side.START, testFailureCount); + reportTestFailure( + file, lineNumber, String.join("\n", errorMessageLines).replace('\t', ' ')); + out.println(""); + for (String errorMessageLine : errorMessageLines) { + out.println(""); + } + 
out.println("
"); + out.println(toHTML.transform(errorMessageLine).replace("\t", "")); + out.println("
"); + printErrorLine("Test Failure", Side.END, testFailureCount); + } + } + private static void equivalencesLine(String line, ParsePosition pp, String file, int lineNumber) throws ParseException { pp.setIndex("OnPairsOf".length()); diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt index 465c613c6..98613a31c 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt @@ -101,9 +101,15 @@ # Overrides for bugs # TODO(egg): These are specified in their respective files, we should not need them here. + # @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; # @missing: 0000..10FFFF; Equivalent_Unified_Ideograph; +# At least the following two appear to be needed because of issues related to caching; +# See comments in IndexUnicodeProperties.java. +# @missing: 0000..10FFFF; NFKC_Casefold; +# @missing: 0000..10FFFF; NFKC_SCF; + # Extras # @missing: 0000..10FFFF; Idn_Status ; disallowed diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt new file mode 100644 index 000000000..57b2f5e96 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt @@ -0,0 +1,19 @@ +# This file uses the invariant test language, but contains comparisons between +# new and pre-existing characters to aid in PAG review of encoding proposals. + +## Unicode 16.0 additions. + +# These comparisons were not in place when properties were initially assigned for the 16.0 +# répertoire. +# We note here the feedback about errors that would have been caught by them. + +# U+18CFF is a blank character for the Khitan Small Script; aside from looking blank, +# it is indistinguishable from other Khitan Small Script characters. 
See L2/23-065. +# In particular, it is ideographic: https://www.unicode.org/review/pri497/feedback.html#ID20240216140104. +Propertywise [\N{KHITAN SMALL SCRIPT CHARACTER-18CFF} \N{KHITAN SMALL SCRIPT CHARACTER-18B00}] AreAlike, Except: Age + +# HXG (briefly known as HZXG) and SZP are just like all the other CJK strokes. +# In particular, they are scx=Hani: https://www.unicode.org/review/pri502/feedback.html#ID20240523095709. +Propertywise [\N{CJK STROKE T} \N{CJK STROKE HXG}\N{CJK STROKE SZP}] AreAlike, Except: Age + +## Provisionally assigned. [placeholder for draft PRs] \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index d9cbb53a4..3eb759238 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -112,6 +112,20 @@ # OnPairsOf $strings, EqualityOf Case_Folding ⇏ EqualityOf Simple_Case_Folding # OnPairsOf $strings, EqualityOf Case_Folding ⇐ EqualityOf Simple_Case_Folding ########################## +# Propertywise <unicodeSet> AreAlike [, Except: <properties>] +# +# Checks that all property assignments of the code points in <unicodeSet> are the same, +# except for the Name property and any properties listed in the space-separated +# Except clause. +# For the purposes of this check, if all characters in <unicodeSet> are mapped to themselves +# by some property with default value <code point>, these assignments are the same. +# +# Example: Propertywise [𐛪 𐛫] AreAlike +# These two Linear A signs (A751 and A752) behave identically. +# Example: Propertywise [ي ۑ] AreAlike, Except: Confusable_MA Unicode_1_Name +# This checks that yeh (with two dots) and yeh with three dots behave the same, +# except for confusability and their name in Unicode 1 (both have one, so it is different).
+########################## # There is new syntax for testing UnicodeMaps # # Map @@ -1091,3 +1105,7 @@ In [\p{Block=Hangul Syllables} - \p{gc=Cn}], (prepend HANGUL SYLLABLE ) * (strin # https://www.unicode.org/review/pri497/feedback.html#ID20240216135149. In \p{Decomposition_Type=font}, Bidi_Class = Bidi_Class * Decomposition_Mapping +# Basic Propertywise tests. +Propertywise [𐛪 𐛫] AreAlike +Propertywise [ي ۑ] AreAlike, Except: Confusable_MA Unicode_1_Name + diff --git a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java index f8a578325..9fe411807 100644 --- a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java +++ b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java @@ -35,6 +35,14 @@ void testUnicodeInvariants() throws IOException { assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(default) failed"); } + @Test + void testAdditionComparisons() throws IOException { + int rc = + TestUnicodeInvariants.testInvariants( + "AdditionComparisons.txt", "addition-comparisons", true); + assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(addition-comparisons) failed"); + } + @Test void testSecurityInvariants() throws IOException { int rc = From df3b57f3b7dac00635b1296bbfc57986a3d28eb8 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 4 Jun 2024 10:25:39 +0200 Subject: [PATCH 04/10] NamesList update for UTC-164-A42. 
(#843) [TXT] NamesList-16.0.0d24.txt 2024-06-03 14:56 1.9M Co-authored-by: Ken Whistler --- unicodetools/data/ucd/dev/NamesList.txt | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/unicodetools/data/ucd/dev/NamesList.txt b/unicodetools/data/ucd/dev/NamesList.txt index ea2436c02..4598934ff 100644 --- a/unicodetools/data/ucd/dev/NamesList.txt +++ b/unicodetools/data/ucd/dev/NamesList.txt @@ -1,23 +1,11 @@ ; charset=UTF-8 @@@ The Unicode Standard 16.0.0 @@@+ NamesList-16.0.0.txt -@+ Generation Date: 2024-05-17, 19:24:02 GMT +@+ Generation Date: 2024-06-03, 12:53:56 GMT Unicode 16.0.0 names list. Repertoire synched with UnicodeData-16.0.0d16.txt. - Pre-beta rollup of various fixes. - Add xref between 131A6 and 13DEE. - Add xrefs between 01C3, A71D, 107B9. - Added xrefs from 1DF0A to A71D and 107B9. - Added formal aliases and annotation for 1E899, 1E89A - Removed unneeded subheads for two postponed archaic SHRII characters. - Added formal alias for 12327. - Added alias and annotation for 12326. - Added xrefs between 050F and 1C8A. - Added an annotation about Amerindian orthographic use for 00B7. - Add notices about use of colon in Egyptian hieroglyph annotations. - Add annotation for 0B35; update annotation for 0B55. - Add annotation for 1DF8. - Add formal name alias for 1680B. + Post-beta rollup of various fixes. + Add subheads and annotations for 1FB81, 1FB98, 1FB99. This file is semi-automatically derived from UnicodeData.txt and a set of manually created annotations using a script to select or suppress information from the data file. The rules used @@ -64096,7 +64084,10 @@ FFFF 1FB7E RIGHT AND UPPER ONE EIGHTH BLOCK 1FB7F RIGHT AND LOWER ONE EIGHTH BLOCK 1FB80 UPPER AND LOWER ONE EIGHTH BLOCK +@ Window title bar +@+ This character is a legacy graphic used to draw the title bar of the active window. The lines corresponding to 3 and 5 are not actually block elements, but can show any horizontally repeating pattern. 
1FB81 HORIZONTAL ONE EIGHTH BLOCK-1358 +@ Block elements 1FB82 UPPER ONE QUARTER BLOCK x (lower one quarter block - 2582) 1FB83 UPPER THREE EIGHTHS BLOCK @@ -64138,6 +64129,8 @@ FFFF * upper middle and lower one quarter block x (geta mark - 3013) x (block octant-3478 - 1CDB7) +@ Diagonal fill characters +@+ The filled area for these diagonal fill characters typically covers between one quarter and one half the total area. The diagonal lines should be of uniform width. 1FB98 UPPER LEFT TO LOWER RIGHT FILL x (square with upper left to lower right fill - 25A7) 1FB99 UPPER RIGHT TO LOWER LEFT FILL From be21de54d4e7e80713a73a0812e28d040a287e9a Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Tue, 4 Jun 2024 19:07:21 -0700 Subject: [PATCH 05/10] Address confusable AIs for 16.0 (#841) --- docs/security.md | 54 +++++++--- .../data/security/dev/confusables.txt | 99 ++++++++++--------- .../data/security/dev/confusablesSummary.txt | 35 +++++-- .../dev/data/confusablesSummaryIdentifier.txt | 12 ++- .../dev/data/source/confusables-source.txt | 37 ++++++- .../dev/data/source/formatted-source.txt | 18 +++- 6 files changed, 184 insertions(+), 71 deletions(-) diff --git a/docs/security.md b/docs/security.md index 3229c35a9..01bb8955c 100644 --- a/docs/security.md +++ b/docs/security.md @@ -9,8 +9,8 @@ machine-generated, then tweaked. They have names like source/confusables-winFonts.txt. The main file is confusables-source.txt. ***There is fairly complex processing for the confusables, so carefully diff the -results. Sometimes you may get an unexpected union of two equivalence sets. Look -at Testing below for help.*** +results. Sometimes you may get an unexpected union of two equivalence sets. +Look at Testing below for help.*** Look at the following spreadsheets / bugs to see if there are any additional suggestions. @@ -19,17 +19,38 @@ suggestions. 
Suggestions](https://docs.google.com/spreadsheet/ccc?key=0ArRWBHdd5mx-dHRXelRVbXRYSVp2QTNDdTBlV1I5X1E&usp=drive_web#gid=0)** * **[Identifier Restriction Suggestions](https://docs.google.com/spreadsheet/ccc?key=0ArRWBHdd5mx-dEJJWkdzZzk4cDRYbEVLTmhraGN0Q3c&usp=drive_web#gid=0)** -* *[Unicode - Bugs](http://www.unicode.org/edcom/bugtrack/query?status=accepted&status=assigned&status=new&status=reopened&group=component&order=priority&col=id&col=summary&col=status&col=type&col=priority&col=milestone&col=component&owner=mark&report=10) - (under TR #36/39)*\ - :construction: **TODO**: That Trac instance is gone. - Markus thinks we decided that there was nothing useful in it, - and deleted it without saving data. Check with Mark. +* *[Sample PRs](https://github.com/unicode-org/unicodetools/pull/841)* If so, assess and add to unicodetools/data/security/{version}/data/source/confusables-source.txt — *if needed.* - Then in the spreadsheets, move the "new stuff" line to the end. +### File Format +There is a brief description of the file format at the top. +Each line represents a mapping from a code point or set of code points to a sequence of one or more code points. + +For example: +``` +0021 ; 01C3 # ( ! → ǃ) EXCLAMATION MARK → LATIN LETTER RETROFLEX CLICK +``` + +The ordering of characters doesn't matter. +So it doesn't matter whether you have the above line, or +``` +01C3 ; 0021 # ( ǃ → !) LATIN LETTER RETROFLEX CLICK → EXCLAMATION MARK +``` +It also doesn't matter if you have identical lines; the second one will be a NOOP. + +The mappings are used to generate equivalence classes. +From each equivalence class, one representative member will be chosen, +and in the resulting data file, all the other characters will map to that representative. +Because of transitivity, the equivalence class will tend to be somewhat looser than expected. + +We've discussed possible future enhancements: +- Have a second, narrower mapping that is more exact.
+- Allow for mappings from sequences to sequences (instead of just code points to sequences). +- Provide for context, perhaps like the Transform rules. + E.g., [x { a } y → A](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3Aarabic_type%3A%5D&g=&i=) + ## Before generating First, in CLDR, update the script metadata: @@ -51,13 +72,10 @@ Run GenerateConfusables -c -b to generate the files. They will appear in two pla * reformatted source, log * $UNICODETOOLS_DIR/data/security/11.0.0/* *including log.txt* -**Run TestSecurity to verify that the confusable mappings are idempotent!** +The TestSecurity.java test is part of the unit test suite, run by GitHub CI. +It verifies that the confusable mappings are idempotent. -With the same VM arguments as the generator. -Starting in 2021q3, TestSecurity needs to be run as a JUnit test. -It is also now part of the unit test suite and run on GitHub CI. - -Copy the following from the output directory to the top level of the revision directory: +Copy the following from the output directory to the top level of the revision directory, and check in. * confusables.txt * confusablesSummary.txt @@ -66,6 +84,12 @@ Copy the following from the output directory to the top level of the revision di * ReadMe.txt * xidmodifications.txt +### Review + +Review the mappings to make sure that there are no surprises. +The biggest issue is if two equivalence classes are mistakenly joined. +For example, if you map b to d, then that will join the equivalence class for b with that of d. + ### IdentifierStatus.txt & IdentifierType.txt Markus 2020-feb-07 for Unicode 13.0: diff --git a/unicodetools/data/security/dev/confusables.txt b/unicodetools/data/security/dev/confusables.txt index 531fd2a7f..ecbd58c23 100644 --- a/unicodetools/data/security/dev/confusables.txt +++ b/unicodetools/data/security/dev/confusables.txt @@ -1,5 +1,5 @@ # confusables.txt -# Date: 2024-05-03, 03:26:41 GMT +# Date: 2024-05-31, 21:12:55 GMT # © 2024 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -577,10 +577,10 @@ FF07 ; 0027 ; MA #* ( ' → ' ) FULLWIDTH APOSTROPHE → APOSTROPHE # →’ 2018 ; 0027 ; MA #* ( ‘ → ' ) LEFT SINGLE QUOTATION MARK → APOSTROPHE # 2019 ; 0027 ; MA #* ( ’ → ' ) RIGHT SINGLE QUOTATION MARK → APOSTROPHE # 201B ; 0027 ; MA #* ( ‛ → ' ) SINGLE HIGH-REVERSED-9 QUOTATION MARK → APOSTROPHE # →′→ +05F3 ; 0027 ; MA #* ( ‎׳‎ → ' ) HEBREW PUNCTUATION GERESH → APOSTROPHE # 2032 ; 0027 ; MA #* ( ′ → ' ) PRIME → APOSTROPHE # 2035 ; 0027 ; MA #* ( ‵ → ' ) REVERSED PRIME → APOSTROPHE # →ʽ→→‘→ 055A ; 0027 ; MA #* ( ՚ → ' ) ARMENIAN APOSTROPHE → APOSTROPHE # →’→ -05F3 ; 0027 ; MA #* ( ‎׳‎ → ' ) HEBREW PUNCTUATION GERESH → APOSTROPHE # 0060 ; 0027 ; MA #* ( ` → ' ) GRAVE ACCENT → APOSTROPHE # →ˋ→→`→→‘→ 1FEF ; 0027 ; MA #* ( ` → ' ) GREEK VARIA → APOSTROPHE # →ˋ→→`→→‘→ FF40 ; 0027 ; MA #* ( ` → ' ) FULLWIDTH GRAVE ACCENT → APOSTROPHE # →‘→ @@ -593,7 +593,7 @@ FF40 ; 0027 ; MA #* ( ` → ' ) FULLWIDTH GRAVE ACCENT → APOSTROPHE # →‘ 02B9 ; 0027 ; MA # ( ʹ → ' ) MODIFIER LETTER PRIME → APOSTROPHE # 0374 ; 0027 ; MA # ( ʹ → ' ) GREEK NUMERAL SIGN → APOSTROPHE # →′→ 02C8 ; 0027 ; MA # ( ˈ → ' ) MODIFIER LETTER VERTICAL LINE → APOSTROPHE # -02CA ; 0027 ; MA # ( ˊ → ' ) MODIFIER LETTER ACUTE ACCENT → APOSTROPHE # →ʹ→→′→ +02CA ; 0027 ; MA # ( ˊ → ' ) MODIFIER LETTER ACUTE ACCENT → APOSTROPHE # →΄→→ʹ→ 02CB ; 0027 ; MA # ( ˋ → ' ) MODIFIER LETTER GRAVE ACCENT → APOSTROPHE # →`→→‘→ 02F4 ; 0027 ; MA #* ( ˴ → ' ) MODIFIER LETTER MIDDLE GRAVE ACCENT → APOSTROPHE # →ˋ→→`→→‘→ 02BB ; 0027 ; MA # ( ʻ → ' ) MODIFIER LETTER TURNED COMMA → APOSTROPHE # →‘→ @@ -615,10 +615,10 @@ FF02 ; 0027 0027 ; MA #* ( " → '' ) FULLWIDTH QUOTATION MARK → APOSTROPHE, 201C ; 0027 0027 ; MA #* ( “ → '' ) LEFT DOUBLE QUOTATION MARK → APOSTROPHE, APOSTROPHE # →"→ 201D ; 0027 0027 ; MA #* ( ” → '' ) RIGHT DOUBLE 
QUOTATION MARK → APOSTROPHE, APOSTROPHE # →"→ 201F ; 0027 0027 ; MA #* ( ‟ → '' ) DOUBLE HIGH-REVERSED-9 QUOTATION MARK → APOSTROPHE, APOSTROPHE # →“→→"→ +05F4 ; 0027 0027 ; MA #* ( ‎״‎ → '' ) HEBREW PUNCTUATION GERSHAYIM → APOSTROPHE, APOSTROPHE # →"→ 2033 ; 0027 0027 ; MA #* ( ″ → '' ) DOUBLE PRIME → APOSTROPHE, APOSTROPHE # →"→ 2036 ; 0027 0027 ; MA #* ( ‶ → '' ) REVERSED DOUBLE PRIME → APOSTROPHE, APOSTROPHE # →‵‵→ 3003 ; 0027 0027 ; MA #* ( 〃 → '' ) DITTO MARK → APOSTROPHE, APOSTROPHE # →″→→"→ -05F4 ; 0027 0027 ; MA #* ( ‎״‎ → '' ) HEBREW PUNCTUATION GERSHAYIM → APOSTROPHE, APOSTROPHE # →"→ 02DD ; 0027 0027 ; MA #* ( ˝ → '' ) DOUBLE ACUTE ACCENT → APOSTROPHE, APOSTROPHE # →"→ 02BA ; 0027 0027 ; MA # ( ʺ → '' ) MODIFIER LETTER DOUBLE PRIME → APOSTROPHE, APOSTROPHE # →"→ 02F6 ; 0027 0027 ; MA #* ( ˶ → '' ) MODIFIER LETTER MIDDLE DOUBLE ACUTE ACCENT → APOSTROPHE, APOSTROPHE # →˝→→"→ @@ -1417,6 +1417,7 @@ A9C6 ; A9D0 ; MA #* ( ꧆ → ꧐ ) JAVANESE PADA WINDU → JAVANESE DIGIT ZERO 23E8 ; 2081 2080 ; MA #* ( ⏨ → ₁₀ ) DECIMAL EXPONENT SYMBOL → SUBSCRIPT ONE, SUBSCRIPT ZERO # +1CCF2 ; 0032 ; MA # ( 𜳲 → 2 ) OUTLINED DIGIT TWO → DIGIT TWO # 1D7D0 ; 0032 ; MA # ( 𝟐 → 2 ) MATHEMATICAL BOLD DIGIT TWO → DIGIT TWO # 1D7DA ; 0032 ; MA # ( 𝟚 → 2 ) MATHEMATICAL DOUBLE-STRUCK DIGIT TWO → DIGIT TWO # 1D7E4 ; 0032 ; MA # ( 𝟤 → 2 ) MATHEMATICAL SANS-SERIF DIGIT TWO → DIGIT TWO # @@ -1429,7 +1430,6 @@ A75A ; 0032 ; MA # ( Ꝛ → 2 ) LATIN CAPITAL LETTER R ROTUNDA → DIGIT TWO # A644 ; 0032 ; MA # ( Ꙅ → 2 ) CYRILLIC CAPITAL LETTER REVERSED DZE → DIGIT TWO # →Ƨ→ 14BF ; 0032 ; MA # ( ᒿ → 2 ) CANADIAN SYLLABICS SAYISI M → DIGIT TWO # A6EF ; 0032 ; MA # ( ꛯ → 2 ) BAMUM LETTER KOGHOM → DIGIT TWO # →Ƨ→ -1CCF2 ; 0032 ; MA # ( 𜳲 → 2 ) OUTLINED DIGIT TWO → DIGIT TWO # A9CF ; 0662 ; MA # ( ꧏ → ‎٢‎ ) JAVANESE PANGRANGKEP → ARABIC-INDIC DIGIT TWO # 06F2 ; 0662 ; MA # ( ۲ → ‎٢‎ ) EXTENDED ARABIC-INDIC DIGIT TWO → ARABIC-INDIC DIGIT TWO # @@ -1491,6 +1491,7 @@ A9CF ; 0662 ; MA # ( ꧏ → ‎٢‎ ) JAVANESE 
PANGRANGKEP → ARABIC-INDIC DI 335A ; 0032 70B9 ; MA #* ( ㍚ → 2点 ) IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWO → DIGIT TWO, CJK UNIFIED IDEOGRAPH-70B9 # 1D206 ; 0033 ; MA #* ( 𝈆 → 3 ) GREEK VOCAL NOTATION SYMBOL-7 → DIGIT THREE # +1CCF3 ; 0033 ; MA # ( 𜳳 → 3 ) OUTLINED DIGIT THREE → DIGIT THREE # 1D7D1 ; 0033 ; MA # ( 𝟑 → 3 ) MATHEMATICAL BOLD DIGIT THREE → DIGIT THREE # 1D7DB ; 0033 ; MA # ( 𝟛 → 3 ) MATHEMATICAL DOUBLE-STRUCK DIGIT THREE → DIGIT THREE # 1D7E5 ; 0033 ; MA # ( 𝟥 → 3 ) MATHEMATICAL SANS-SERIF DIGIT THREE → DIGIT THREE # @@ -1506,7 +1507,6 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 04E0 ; 0033 ; MA # ( Ӡ → 3 ) CYRILLIC CAPITAL LETTER ABKHASIAN DZE → DIGIT THREE # →Ʒ→ 16F3B ; 0033 ; MA # ( 𖼻 → 3 ) MIAO LETTER ZA → DIGIT THREE # →Ʒ→ 118CA ; 0033 ; MA # ( 𑣊 → 3 ) WARANG CITI SMALL LETTER ANG → DIGIT THREE # -1CCF3 ; 0033 ; MA # ( 𜳳 → 3 ) OUTLINED DIGIT THREE → DIGIT THREE # 06F3 ; 0663 ; MA # ( ۳ → ‎٣‎ ) EXTENDED ARABIC-INDIC DIGIT THREE → ARABIC-INDIC DIGIT THREE # 1E8C9 ; 0663 ; MA #* ( ‎𞣉‎ → ‎٣‎ ) MENDE KIKAKUI DIGIT THREE → ARABIC-INDIC DIGIT THREE # @@ -1531,6 +1531,7 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 335B ; 0033 70B9 ; MA #* ( ㍛ → 3点 ) IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR THREE → DIGIT THREE, CJK UNIFIED IDEOGRAPH-70B9 # +1CCF4 ; 0034 ; MA # ( 𜳴 → 4 ) OUTLINED DIGIT FOUR → DIGIT FOUR # 1D7D2 ; 0034 ; MA # ( 𝟒 → 4 ) MATHEMATICAL BOLD DIGIT FOUR → DIGIT FOUR # 1D7DC ; 0034 ; MA # ( 𝟜 → 4 ) MATHEMATICAL DOUBLE-STRUCK DIGIT FOUR → DIGIT FOUR # 1D7E6 ; 0034 ; MA # ( 𝟦 → 4 ) MATHEMATICAL SANS-SERIF DIGIT FOUR → DIGIT FOUR # @@ -1539,7 +1540,6 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 1FBF4 ; 0034 ; MA # ( 🯴 → 4 ) SEGMENTED DIGIT FOUR → DIGIT FOUR # 13CE ; 0034 ; MA # ( Ꮞ → 4 ) CHEROKEE LETTER SE → DIGIT FOUR # 118AF ; 0034 ; MA # ( 𑢯 → 4 ) WARANG CITI CAPITAL LETTER UC → DIGIT FOUR # -1CCF4 ; 0034 ; MA # ( 𜳴 → 4 ) OUTLINED DIGIT FOUR → DIGIT FOUR # 06F4 ; 0664 
; MA # ( ۴ → ‎٤‎ ) EXTENDED ARABIC-INDIC DIGIT FOUR → ARABIC-INDIC DIGIT FOUR # @@ -1559,6 +1559,7 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 335C ; 0034 70B9 ; MA #* ( ㍜ → 4点 ) IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR FOUR → DIGIT FOUR, CJK UNIFIED IDEOGRAPH-70B9 # +1CCF5 ; 0035 ; MA # ( 𜳵 → 5 ) OUTLINED DIGIT FIVE → DIGIT FIVE # 1D7D3 ; 0035 ; MA # ( 𝟓 → 5 ) MATHEMATICAL BOLD DIGIT FIVE → DIGIT FIVE # 1D7DD ; 0035 ; MA # ( 𝟝 → 5 ) MATHEMATICAL DOUBLE-STRUCK DIGIT FIVE → DIGIT FIVE # 1D7E7 ; 0035 ; MA # ( 𝟧 → 5 ) MATHEMATICAL SANS-SERIF DIGIT FIVE → DIGIT FIVE # @@ -1567,7 +1568,6 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 1FBF5 ; 0035 ; MA # ( 🯵 → 5 ) SEGMENTED DIGIT FIVE → DIGIT FIVE # 01BC ; 0035 ; MA # ( Ƽ → 5 ) LATIN CAPITAL LETTER TONE FIVE → DIGIT FIVE # 118BB ; 0035 ; MA # ( 𑢻 → 5 ) WARANG CITI CAPITAL LETTER HORR → DIGIT FIVE # -1CCF5 ; 0035 ; MA # ( 𜳵 → 5 ) OUTLINED DIGIT FIVE → DIGIT FIVE # 2464 ; 2784 ; MA #* ( ⑤ → ➄ ) CIRCLED DIGIT FIVE → DINGBAT CIRCLED SANS-SERIF DIGIT FIVE # @@ -1581,6 +1581,7 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 335D ; 0035 70B9 ; MA #* ( ㍝ → 5点 ) IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR FIVE → DIGIT FIVE, CJK UNIFIED IDEOGRAPH-70B9 # +1CCF6 ; 0036 ; MA # ( 𜳶 → 6 ) OUTLINED DIGIT SIX → DIGIT SIX # 1D7D4 ; 0036 ; MA # ( 𝟔 → 6 ) MATHEMATICAL BOLD DIGIT SIX → DIGIT SIX # 1D7DE ; 0036 ; MA # ( 𝟞 → 6 ) MATHEMATICAL DOUBLE-STRUCK DIGIT SIX → DIGIT SIX # 1D7E8 ; 0036 ; MA # ( 𝟨 → 6 ) MATHEMATICAL SANS-SERIF DIGIT SIX → DIGIT SIX # @@ -1591,7 +1592,6 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 0431 ; 0036 ; MA # ( б → 6 ) CYRILLIC SMALL LETTER BE → DIGIT SIX # 13EE ; 0036 ; MA # ( Ꮾ → 6 ) CHEROKEE LETTER WV → DIGIT SIX # 118D5 ; 0036 ; MA # ( 𑣕 → 6 ) WARANG CITI SMALL LETTER AT → DIGIT SIX # -1CCF6 ; 0036 ; MA # ( 𜳶 → 6 ) OUTLINED DIGIT SIX → DIGIT SIX # 06F6 ; 0666 ; MA # ( ۶ → ‎٦‎ ) EXTENDED ARABIC-INDIC DIGIT SIX → 
ARABIC-INDIC DIGIT SIX # @@ -1610,6 +1610,7 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 335E ; 0036 70B9 ; MA #* ( ㍞ → 6点 ) IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR SIX → DIGIT SIX, CJK UNIFIED IDEOGRAPH-70B9 # 1D212 ; 0037 ; MA #* ( 𝈒 → 7 ) GREEK VOCAL NOTATION SYMBOL-19 → DIGIT SEVEN # +1CCF7 ; 0037 ; MA # ( 𜳷 → 7 ) OUTLINED DIGIT SEVEN → DIGIT SEVEN # 1D7D5 ; 0037 ; MA # ( 𝟕 → 7 ) MATHEMATICAL BOLD DIGIT SEVEN → DIGIT SEVEN # 1D7DF ; 0037 ; MA # ( 𝟟 → 7 ) MATHEMATICAL DOUBLE-STRUCK DIGIT SEVEN → DIGIT SEVEN # 1D7E9 ; 0037 ; MA # ( 𝟩 → 7 ) MATHEMATICAL SANS-SERIF DIGIT SEVEN → DIGIT SEVEN # @@ -1618,7 +1619,6 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 1FBF7 ; 0037 ; MA # ( 🯷 → 7 ) SEGMENTED DIGIT SEVEN → DIGIT SEVEN # 104D2 ; 0037 ; MA # ( 𐓒 → 7 ) OSAGE CAPITAL LETTER ZA → DIGIT SEVEN # 118C6 ; 0037 ; MA # ( 𑣆 → 7 ) WARANG CITI SMALL LETTER II → DIGIT SEVEN # -1CCF7 ; 0037 ; MA # ( 𜳷 → 7 ) OUTLINED DIGIT SEVEN → DIGIT SEVEN # 2466 ; 2786 ; MA #* ( ⑦ → ➆ ) CIRCLED DIGIT SEVEN → DINGBAT CIRCLED SANS-SERIF DIGIT SEVEN # @@ -1636,6 +1636,7 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 09EA ; 0038 ; MA # ( ৪ → 8 ) BENGALI DIGIT FOUR → DIGIT EIGHT # 0A6A ; 0038 ; MA # ( ੪ → 8 ) GURMUKHI DIGIT FOUR → DIGIT EIGHT # 1E8CB ; 0038 ; MA #* ( ‎𞣋‎ → 8 ) MENDE KIKAKUI DIGIT FIVE → DIGIT EIGHT # +1CCF8 ; 0038 ; MA # ( 𜳸 → 8 ) OUTLINED DIGIT EIGHT → DIGIT EIGHT # 1D7D6 ; 0038 ; MA # ( 𝟖 → 8 ) MATHEMATICAL BOLD DIGIT EIGHT → DIGIT EIGHT # 1D7E0 ; 0038 ; MA # ( 𝟠 → 8 ) MATHEMATICAL DOUBLE-STRUCK DIGIT EIGHT → DIGIT EIGHT # 1D7EA ; 0038 ; MA # ( 𝟪 → 8 ) MATHEMATICAL SANS-SERIF DIGIT EIGHT → DIGIT EIGHT # @@ -1645,7 +1646,6 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 0223 ; 0038 ; MA # ( ȣ → 8 ) LATIN SMALL LETTER OU → DIGIT EIGHT # 0222 ; 0038 ; MA # ( Ȣ → 8 ) LATIN CAPITAL LETTER OU → DIGIT EIGHT # 1031A ; 0038 ; MA # ( 𐌚 → 8 ) OLD ITALIC LETTER EF → DIGIT EIGHT # -1CCF8 ; 
0038 ; MA # ( 𜳸 → 8 ) OUTLINED DIGIT EIGHT → DIGIT EIGHT # 0AEE ; 096E ; MA # ( ૮ → ८ ) GUJARATI DIGIT EIGHT → DEVANAGARI DIGIT EIGHT # @@ -1665,6 +1665,7 @@ A76A ; 0033 ; MA # ( Ꝫ → 3 ) LATIN CAPITAL LETTER ET → DIGIT THREE # 0B68 ; 0039 ; MA # ( ୨ → 9 ) ORIYA DIGIT TWO → DIGIT NINE # 09ED ; 0039 ; MA # ( ৭ → 9 ) BENGALI DIGIT SEVEN → DIGIT NINE # 0D6D ; 0039 ; MA # ( ൭ → 9 ) MALAYALAM DIGIT SEVEN → DIGIT NINE # +1CCF9 ; 0039 ; MA # ( 𜳹 → 9 ) OUTLINED DIGIT NINE → DIGIT NINE # 1D7D7 ; 0039 ; MA # ( 𝟗 → 9 ) MATHEMATICAL BOLD DIGIT NINE → DIGIT NINE # 1D7E1 ; 0039 ; MA # ( 𝟡 → 9 ) MATHEMATICAL DOUBLE-STRUCK DIGIT NINE → DIGIT NINE # 1D7EB ; 0039 ; MA # ( 𝟫 → 9 ) MATHEMATICAL SANS-SERIF DIGIT NINE → DIGIT NINE # @@ -1676,7 +1677,6 @@ A76E ; 0039 ; MA # ( Ꝯ → 9 ) LATIN CAPITAL LETTER CON → DIGIT NINE # 118CC ; 0039 ; MA # ( 𑣌 → 9 ) WARANG CITI SMALL LETTER KO → DIGIT NINE # 118AC ; 0039 ; MA # ( 𑢬 → 9 ) WARANG CITI CAPITAL LETTER KO → DIGIT NINE # 118D6 ; 0039 ; MA # ( 𑣖 → 9 ) WARANG CITI SMALL LETTER AM → DIGIT NINE # -1CCF9 ; 0039 ; MA # ( 𜳹 → 9 ) OUTLINED DIGIT NINE → DIGIT NINE # 0967 ; 0669 ; MA # ( १ → ‎٩‎ ) DEVANAGARI DIGIT ONE → ARABIC-INDIC DIGIT NINE # 118E4 ; 0669 ; MA # ( 𑣤 → ‎٩‎ ) WARANG CITI DIGIT FOUR → ARABIC-INDIC DIGIT NINE # @@ -1723,6 +1723,7 @@ FF41 ; 0061 ; MA # ( a → a ) FULLWIDTH LATIN SMALL LETTER A → LATIN SMALL 2DF6 ; 0363 ; MA # ( ⷶ → ͣ ) COMBINING CYRILLIC LETTER A → COMBINING LATIN SMALL LETTER A # FF21 ; 0041 ; MA # ( A → A ) FULLWIDTH LATIN CAPITAL LETTER A → LATIN CAPITAL LETTER A # →А→ +1CCD6 ; 0041 ; MA #* ( 𜳖 → A ) OUTLINED LATIN CAPITAL LETTER A → LATIN CAPITAL LETTER A # 1D400 ; 0041 ; MA # ( 𝐀 → A ) MATHEMATICAL BOLD CAPITAL A → LATIN CAPITAL LETTER A # 1D434 ; 0041 ; MA # ( 𝐴 → A ) MATHEMATICAL ITALIC CAPITAL A → LATIN CAPITAL LETTER A # 1D468 ; 0041 ; MA # ( 𝑨 → A ) MATHEMATICAL BOLD ITALIC CAPITAL A → LATIN CAPITAL LETTER A # @@ -1748,7 +1749,6 @@ FF21 ; 0041 ; MA # ( A → A ) FULLWIDTH LATIN CAPITAL LETTER A → LATIN CAPI A4EE 
; 0041 ; MA # ( ꓮ → A ) LISU LETTER A → LATIN CAPITAL LETTER A # 16F40 ; 0041 ; MA # ( 𖽀 → A ) MIAO LETTER ZZYA → LATIN CAPITAL LETTER A # 102A0 ; 0041 ; MA # ( 𐊠 → A ) CARIAN LETTER A → LATIN CAPITAL LETTER A # -1CCD6 ; 0041 ; MA #* ( 𜳖 → A ) OUTLINED LATIN CAPITAL LETTER A → LATIN CAPITAL LETTER A # 2376 ; 0061 0332 ; MA #* ( ⍶ → a̲ ) APL FUNCTIONAL SYMBOL ALPHA UNDERBAR → LATIN SMALL LETTER A, COMBINING LOW LINE # →α̲→→ɑ̲→ @@ -1826,6 +1826,7 @@ A4EF ; 2C6F ; MA # ( ꓯ → Ɐ ) LISU LETTER AE → LATIN CAPITAL LETTER TURNE FF22 ; 0042 ; MA # ( B → B ) FULLWIDTH LATIN CAPITAL LETTER B → LATIN CAPITAL LETTER B # →Β→ 212C ; 0042 ; MA # ( ℬ → B ) SCRIPT CAPITAL B → LATIN CAPITAL LETTER B # +1CCD7 ; 0042 ; MA #* ( 𜳗 → B ) OUTLINED LATIN CAPITAL LETTER B → LATIN CAPITAL LETTER B # 1D401 ; 0042 ; MA # ( 𝐁 → B ) MATHEMATICAL BOLD CAPITAL B → LATIN CAPITAL LETTER B # 1D435 ; 0042 ; MA # ( 𝐵 → B ) MATHEMATICAL ITALIC CAPITAL B → LATIN CAPITAL LETTER B # 1D469 ; 0042 ; MA # ( 𝑩 → B ) MATHEMATICAL BOLD ITALIC CAPITAL B → LATIN CAPITAL LETTER B # @@ -1852,7 +1853,6 @@ A4D0 ; 0042 ; MA # ( ꓐ → B ) LISU LETTER BA → LATIN CAPITAL LETTER B # 10282 ; 0042 ; MA # ( 𐊂 → B ) LYCIAN LETTER B → LATIN CAPITAL LETTER B # 102A1 ; 0042 ; MA # ( 𐊡 → B ) CARIAN LETTER P2 → LATIN CAPITAL LETTER B # 10301 ; 0042 ; MA # ( 𐌁 → B ) OLD ITALIC LETTER BE → LATIN CAPITAL LETTER B # -1CCD7 ; 0042 ; MA #* ( 𜳗 → B ) OUTLINED LATIN CAPITAL LETTER B → LATIN CAPITAL LETTER B # 0253 ; 0062 0314 ; MA # ( ɓ → b̔ ) LATIN SMALL LETTER B WITH HOOK → LATIN SMALL LETTER B, COMBINING REVERSED COMMA ABOVE # @@ -1910,6 +1910,7 @@ FF23 ; 0043 ; MA # ( C → C ) FULLWIDTH LATIN CAPITAL LETTER C → LATIN CAPI 216D ; 0043 ; MA # ( Ⅽ → C ) ROMAN NUMERAL ONE HUNDRED → LATIN CAPITAL LETTER C # 2102 ; 0043 ; MA # ( ℂ → C ) DOUBLE-STRUCK CAPITAL C → LATIN CAPITAL LETTER C # 212D ; 0043 ; MA # ( ℭ → C ) BLACK-LETTER CAPITAL C → LATIN CAPITAL LETTER C # +1CCD8 ; 0043 ; MA #* ( 𜳘 → C ) OUTLINED LATIN CAPITAL LETTER C → LATIN CAPITAL 
LETTER C # 1D402 ; 0043 ; MA # ( 𝐂 → C ) MATHEMATICAL BOLD CAPITAL C → LATIN CAPITAL LETTER C # 1D436 ; 0043 ; MA # ( 𝐶 → C ) MATHEMATICAL ITALIC CAPITAL C → LATIN CAPITAL LETTER C # 1D46A ; 0043 ; MA # ( 𝑪 → C ) MATHEMATICAL BOLD ITALIC CAPITAL C → LATIN CAPITAL LETTER C # @@ -1930,7 +1931,6 @@ A4DA ; 0043 ; MA # ( ꓚ → C ) LISU LETTER CA → LATIN CAPITAL LETTER C # 10302 ; 0043 ; MA # ( 𐌂 → C ) OLD ITALIC LETTER KE → LATIN CAPITAL LETTER C # 10415 ; 0043 ; MA # ( 𐐕 → C ) DESERET CAPITAL LETTER CHEE → LATIN CAPITAL LETTER C # 1051C ; 0043 ; MA # ( 𐔜 → C ) ELBASAN LETTER SHE → LATIN CAPITAL LETTER C # -1CCD8 ; 0043 ; MA #* ( 𜳘 → C ) OUTLINED LATIN CAPITAL LETTER C → LATIN CAPITAL LETTER C # 00A2 ; 0063 0338 ; MA #* ( ¢ → c̸ ) CENT SIGN → LATIN SMALL LETTER C, COMBINING LONG SOLIDUS OVERLAY # 023C ; 0063 0338 ; MA # ( ȼ → c̸ ) LATIN SMALL LETTER C WITH STROKE → LATIN SMALL LETTER C, COMBINING LONG SOLIDUS OVERLAY # →¢→ @@ -2006,6 +2006,7 @@ A4D2 ; 0064 ; MA # ( ꓒ → d ) LISU LETTER PHA → LATIN SMALL LETTER D # 216E ; 0044 ; MA # ( Ⅾ → D ) ROMAN NUMERAL FIVE HUNDRED → LATIN CAPITAL LETTER D # 2145 ; 0044 ; MA # ( ⅅ → D ) DOUBLE-STRUCK ITALIC CAPITAL D → LATIN CAPITAL LETTER D # +1CCD9 ; 0044 ; MA #* ( 𜳙 → D ) OUTLINED LATIN CAPITAL LETTER D → LATIN CAPITAL LETTER D # 1D403 ; 0044 ; MA # ( 𝐃 → D ) MATHEMATICAL BOLD CAPITAL D → LATIN CAPITAL LETTER D # 1D437 ; 0044 ; MA # ( 𝐷 → D ) MATHEMATICAL ITALIC CAPITAL D → LATIN CAPITAL LETTER D # 1D46B ; 0044 ; MA # ( 𝑫 → D ) MATHEMATICAL BOLD ITALIC CAPITAL D → LATIN CAPITAL LETTER D # @@ -2023,7 +2024,6 @@ A4D2 ; 0064 ; MA # ( ꓒ → d ) LISU LETTER PHA → LATIN SMALL LETTER D # 15DE ; 0044 ; MA # ( ᗞ → D ) CANADIAN SYLLABICS CARRIER THE → LATIN CAPITAL LETTER D # 15EA ; 0044 ; MA # ( ᗪ → D ) CANADIAN SYLLABICS CARRIER PE → LATIN CAPITAL LETTER D # →ᗞ→ A4D3 ; 0044 ; MA # ( ꓓ → D ) LISU LETTER DA → LATIN CAPITAL LETTER D # -1CCD9 ; 0044 ; MA #* ( 𜳙 → D ) OUTLINED LATIN CAPITAL LETTER D → LATIN CAPITAL LETTER D # 0257 ; 0064 0314 ; MA 
# ( ɗ → d̔ ) LATIN SMALL LETTER D WITH HOOK → LATIN SMALL LETTER D, COMBINING REVERSED COMMA ABOVE # @@ -2099,6 +2099,7 @@ AB32 ; 0065 ; MA # ( ꬲ → e ) LATIN SMALL LETTER BLACKLETTER E → LATIN SMAL 22FF ; 0045 ; MA #* ( ⋿ → E ) Z NOTATION BAG MEMBERSHIP → LATIN CAPITAL LETTER E # FF25 ; 0045 ; MA # ( E → E ) FULLWIDTH LATIN CAPITAL LETTER E → LATIN CAPITAL LETTER E # →Ε→ 2130 ; 0045 ; MA # ( ℰ → E ) SCRIPT CAPITAL E → LATIN CAPITAL LETTER E # +1CCDA ; 0045 ; MA #* ( 𜳚 → E ) OUTLINED LATIN CAPITAL LETTER E → LATIN CAPITAL LETTER E # 1D404 ; 0045 ; MA # ( 𝐄 → E ) MATHEMATICAL BOLD CAPITAL E → LATIN CAPITAL LETTER E # 1D438 ; 0045 ; MA # ( 𝐸 → E ) MATHEMATICAL ITALIC CAPITAL E → LATIN CAPITAL LETTER E # 1D46C ; 0045 ; MA # ( 𝑬 → E ) MATHEMATICAL BOLD ITALIC CAPITAL E → LATIN CAPITAL LETTER E # @@ -2124,7 +2125,6 @@ A4F0 ; 0045 ; MA # ( ꓰ → E ) LISU LETTER E → LATIN CAPITAL LETTER E # 118A6 ; 0045 ; MA # ( 𑢦 → E ) WARANG CITI CAPITAL LETTER II → LATIN CAPITAL LETTER E # 118AE ; 0045 ; MA # ( 𑢮 → E ) WARANG CITI CAPITAL LETTER YUJ → LATIN CAPITAL LETTER E # 10286 ; 0045 ; MA # ( 𐊆 → E ) LYCIAN LETTER I → LATIN CAPITAL LETTER E # -1CCDA ; 0045 ; MA #* ( 𜳚 → E ) OUTLINED LATIN CAPITAL LETTER E → LATIN CAPITAL LETTER E # 011B ; 0115 ; MA # ( ě → ĕ ) LATIN SMALL LETTER E WITH CARON → LATIN SMALL LETTER E WITH BREVE # @@ -2195,6 +2195,7 @@ A799 ; 0066 ; MA # ( ꞙ → f ) LATIN SMALL LETTER F WITH STROKE → LATIN SMAL 1D213 ; 0046 ; MA #* ( 𝈓 → F ) GREEK VOCAL NOTATION SYMBOL-20 → LATIN CAPITAL LETTER F # →Ϝ→ 2131 ; 0046 ; MA # ( ℱ → F ) SCRIPT CAPITAL F → LATIN CAPITAL LETTER F # +1CCDB ; 0046 ; MA #* ( 𜳛 → F ) OUTLINED LATIN CAPITAL LETTER F → LATIN CAPITAL LETTER F # 1D405 ; 0046 ; MA # ( 𝐅 → F ) MATHEMATICAL BOLD CAPITAL F → LATIN CAPITAL LETTER F # 1D439 ; 0046 ; MA # ( 𝐹 → F ) MATHEMATICAL ITALIC CAPITAL F → LATIN CAPITAL LETTER F # 1D46D ; 0046 ; MA # ( 𝑭 → F ) MATHEMATICAL BOLD ITALIC CAPITAL F → LATIN CAPITAL LETTER F # @@ -2217,7 +2218,6 @@ A4DD ; 0046 ; MA # ( ꓝ → F ) 
LISU LETTER TSA → LATIN CAPITAL LETTER F # 10287 ; 0046 ; MA # ( 𐊇 → F ) LYCIAN LETTER W → LATIN CAPITAL LETTER F # 102A5 ; 0046 ; MA # ( 𐊥 → F ) CARIAN LETTER R → LATIN CAPITAL LETTER F # 10525 ; 0046 ; MA # ( 𐔥 → F ) ELBASAN LETTER GHE → LATIN CAPITAL LETTER F # -1CCDB ; 0046 ; MA #* ( 𜳛 → F ) OUTLINED LATIN CAPITAL LETTER F → LATIN CAPITAL LETTER F # 0192 ; 0066 0326 ; MA # ( ƒ → f̦ ) LATIN SMALL LETTER F WITH HOOK → LATIN SMALL LETTER F, COMBINING COMMA BELOW # →f̡→ @@ -2264,6 +2264,7 @@ FF47 ; 0067 ; MA # ( g → g ) FULLWIDTH LATIN SMALL LETTER G → LATIN SMALL 018D ; 0067 ; MA # ( ƍ → g ) LATIN SMALL LETTER TURNED DELTA → LATIN SMALL LETTER G # 0581 ; 0067 ; MA # ( ց → g ) ARMENIAN SMALL LETTER CO → LATIN SMALL LETTER G # +1CCDC ; 0047 ; MA #* ( 𜳜 → G ) OUTLINED LATIN CAPITAL LETTER G → LATIN CAPITAL LETTER G # 1D406 ; 0047 ; MA # ( 𝐆 → G ) MATHEMATICAL BOLD CAPITAL G → LATIN CAPITAL LETTER G # 1D43A ; 0047 ; MA # ( 𝐺 → G ) MATHEMATICAL ITALIC CAPITAL G → LATIN CAPITAL LETTER G # 1D46E ; 0047 ; MA # ( 𝑮 → G ) MATHEMATICAL BOLD ITALIC CAPITAL G → LATIN CAPITAL LETTER G # @@ -2281,7 +2282,6 @@ FF47 ; 0067 ; MA # ( g → g ) FULLWIDTH LATIN SMALL LETTER G → LATIN SMALL 13C0 ; 0047 ; MA # ( Ꮐ → G ) CHEROKEE LETTER NAH → LATIN CAPITAL LETTER G # 13F3 ; 0047 ; MA # ( Ᏻ → G ) CHEROKEE LETTER YU → LATIN CAPITAL LETTER G # A4D6 ; 0047 ; MA # ( ꓖ → G ) LISU LETTER GA → LATIN CAPITAL LETTER G # -1CCDC ; 0047 ; MA #* ( 𜳜 → G ) OUTLINED LATIN CAPITAL LETTER G → LATIN CAPITAL LETTER G # 1DA2 ; 1D4D ; MA # ( ᶢ → ᵍ ) MODIFIER LETTER SMALL SCRIPT G → MODIFIER LETTER SMALL G # @@ -2325,6 +2325,7 @@ FF28 ; 0048 ; MA # ( H → H ) FULLWIDTH LATIN CAPITAL LETTER H → LATIN CAPI 210B ; 0048 ; MA # ( ℋ → H ) SCRIPT CAPITAL H → LATIN CAPITAL LETTER H # 210C ; 0048 ; MA # ( ℌ → H ) BLACK-LETTER CAPITAL H → LATIN CAPITAL LETTER H # 210D ; 0048 ; MA # ( ℍ → H ) DOUBLE-STRUCK CAPITAL H → LATIN CAPITAL LETTER H # +1CCDD ; 0048 ; MA #* ( 𜳝 → H ) OUTLINED LATIN CAPITAL LETTER H → LATIN CAPITAL 
LETTER H # 1D407 ; 0048 ; MA # ( 𝐇 → H ) MATHEMATICAL BOLD CAPITAL H → LATIN CAPITAL LETTER H # 1D43B ; 0048 ; MA # ( 𝐻 → H ) MATHEMATICAL ITALIC CAPITAL H → LATIN CAPITAL LETTER H # 1D46F ; 0048 ; MA # ( 𝑯 → H ) MATHEMATICAL BOLD ITALIC CAPITAL H → LATIN CAPITAL LETTER H # @@ -2347,7 +2348,6 @@ FF28 ; 0048 ; MA # ( H → H ) FULLWIDTH LATIN CAPITAL LETTER H → LATIN CAPI 157C ; 0048 ; MA # ( ᕼ → H ) CANADIAN SYLLABICS NUNAVUT H → LATIN CAPITAL LETTER H # A4E7 ; 0048 ; MA # ( ꓧ → H ) LISU LETTER XA → LATIN CAPITAL LETTER H # 102CF ; 0048 ; MA # ( 𐋏 → H ) CARIAN LETTER E2 → LATIN CAPITAL LETTER H # -1CCDD ; 0048 ; MA #* ( 𜳝 → H ) OUTLINED LATIN CAPITAL LETTER H → LATIN CAPITAL LETTER H # 1D78 ; 1D34 ; MA # ( ᵸ → ᴴ ) MODIFIER LETTER CYRILLIC EN → MODIFIER LETTER CAPITAL H # @@ -2465,6 +2465,7 @@ FF4A ; 006A ; MA # ( j → j ) FULLWIDTH LATIN SMALL LETTER J → LATIN SMALL 0458 ; 006A ; MA # ( ј → j ) CYRILLIC SMALL LETTER JE → LATIN SMALL LETTER J # FF2A ; 004A ; MA # ( J → J ) FULLWIDTH LATIN CAPITAL LETTER J → LATIN CAPITAL LETTER J # →Ј→ +1CCDF ; 004A ; MA #* ( 𜳟 → J ) OUTLINED LATIN CAPITAL LETTER J → LATIN CAPITAL LETTER J # 1D409 ; 004A ; MA # ( 𝐉 → J ) MATHEMATICAL BOLD CAPITAL J → LATIN CAPITAL LETTER J # 1D43D ; 004A ; MA # ( 𝐽 → J ) MATHEMATICAL ITALIC CAPITAL J → LATIN CAPITAL LETTER J # 1D471 ; 004A ; MA # ( 𝑱 → J ) MATHEMATICAL BOLD ITALIC CAPITAL J → LATIN CAPITAL LETTER J # @@ -2484,7 +2485,6 @@ A7B2 ; 004A ; MA # ( Ʝ → J ) LATIN CAPITAL LETTER J WITH CROSSED-TAIL → LA 13AB ; 004A ; MA # ( Ꭻ → J ) CHEROKEE LETTER GU → LATIN CAPITAL LETTER J # 148D ; 004A ; MA # ( ᒍ → J ) CANADIAN SYLLABICS CO → LATIN CAPITAL LETTER J # A4D9 ; 004A ; MA # ( ꓙ → J ) LISU LETTER JA → LATIN CAPITAL LETTER J # -1CCDF ; 004A ; MA #* ( 𜳟 → J ) OUTLINED LATIN CAPITAL LETTER J → LATIN CAPITAL LETTER J # 0249 ; 006A 0335 ; MA # ( ɉ → j̵ ) LATIN SMALL LETTER J WITH STROKE → LATIN SMALL LETTER J, COMBINING SHORT STROKE OVERLAY # @@ -2513,6 +2513,7 @@ AB7B ; 1D0A ; MA # ( ꭻ → ᴊ ) CHEROKEE 
SMALL LETTER GU → LATIN LETTER SMA 212A ; 004B ; MA # ( K → K ) KELVIN SIGN → LATIN CAPITAL LETTER K # FF2B ; 004B ; MA # ( K → K ) FULLWIDTH LATIN CAPITAL LETTER K → LATIN CAPITAL LETTER K # →Κ→ +1CCE0 ; 004B ; MA #* ( 𜳠 → K ) OUTLINED LATIN CAPITAL LETTER K → LATIN CAPITAL LETTER K # 1D40A ; 004B ; MA # ( 𝐊 → K ) MATHEMATICAL BOLD CAPITAL K → LATIN CAPITAL LETTER K # 1D43E ; 004B ; MA # ( 𝐾 → K ) MATHEMATICAL ITALIC CAPITAL K → LATIN CAPITAL LETTER K # 1D472 ; 004B ; MA # ( 𝑲 → K ) MATHEMATICAL BOLD ITALIC CAPITAL K → LATIN CAPITAL LETTER K # @@ -2538,7 +2539,6 @@ FF2B ; 004B ; MA # ( K → K ) FULLWIDTH LATIN CAPITAL LETTER K → LATIN CAPI 16D5 ; 004B ; MA # ( ᛕ → K ) RUNIC LETTER OPEN-P → LATIN CAPITAL LETTER K # A4D7 ; 004B ; MA # ( ꓗ → K ) LISU LETTER KA → LATIN CAPITAL LETTER K # 10518 ; 004B ; MA # ( 𐔘 → K ) ELBASAN LETTER QE → LATIN CAPITAL LETTER K # -1CCE0 ; 004B ; MA #* ( 𜳠 → K ) OUTLINED LATIN CAPITAL LETTER K → LATIN CAPITAL LETTER K # 0199 ; 006B 0314 ; MA # ( ƙ → k̔ ) LATIN SMALL LETTER K WITH HOOK → LATIN SMALL LETTER K, COMBINING REVERSED COMMA ABOVE # @@ -2561,6 +2561,7 @@ FFE8 ; 006C ; MA #* ( │ → l ) HALFWIDTH FORMS LIGHT VERTICAL → LATIN SMALL 06F1 ; 006C ; MA # ( ۱ → l ) EXTENDED ARABIC-INDIC DIGIT ONE → LATIN SMALL LETTER L # →1→ 10320 ; 006C ; MA #* ( 𐌠 → l ) OLD ITALIC NUMERAL ONE → LATIN SMALL LETTER L # →𐌉→→I→ 1E8C7 ; 006C ; MA #* ( ‎𞣇‎ → l ) MENDE KIKAKUI DIGIT ONE → LATIN SMALL LETTER L # +1CCF1 ; 006C ; MA # ( 𜳱 → l ) OUTLINED DIGIT ONE → LATIN SMALL LETTER L # →1→ 1D7CF ; 006C ; MA # ( 𝟏 → l ) MATHEMATICAL BOLD DIGIT ONE → LATIN SMALL LETTER L # →1→ 1D7D9 ; 006C ; MA # ( 𝟙 → l ) MATHEMATICAL DOUBLE-STRUCK DIGIT ONE → LATIN SMALL LETTER L # →1→ 1D7E3 ; 006C ; MA # ( 𝟣 → l ) MATHEMATICAL SANS-SERIF DIGIT ONE → LATIN SMALL LETTER L # →1→ @@ -2572,6 +2573,7 @@ FF29 ; 006C ; MA # ( I → l ) FULLWIDTH LATIN CAPITAL LETTER I → LATIN SMAL 2160 ; 006C ; MA # ( Ⅰ → l ) ROMAN NUMERAL ONE → LATIN SMALL LETTER L # →Ӏ→ 2110 ; 006C ; MA # ( ℐ → l ) 
SCRIPT CAPITAL I → LATIN SMALL LETTER L # →I→ 2111 ; 006C ; MA # ( ℑ → l ) BLACK-LETTER CAPITAL I → LATIN SMALL LETTER L # →I→ +1CCDE ; 006C ; MA #* ( 𜳞 → l ) OUTLINED LATIN CAPITAL LETTER I → LATIN SMALL LETTER L # →I→ 1D408 ; 006C ; MA # ( 𝐈 → l ) MATHEMATICAL BOLD CAPITAL I → LATIN SMALL LETTER L # →I→ 1D43C ; 006C ; MA # ( 𝐼 → l ) MATHEMATICAL ITALIC CAPITAL I → LATIN SMALL LETTER L # →I→ 1D470 ; 006C ; MA # ( 𝑰 → l ) MATHEMATICAL BOLD ITALIC CAPITAL I → LATIN SMALL LETTER L # →I→ @@ -2624,12 +2626,11 @@ A4F2 ; 006C ; MA # ( ꓲ → l ) LISU LETTER I → LATIN SMALL LETTER L # →I 16F28 ; 006C ; MA # ( 𖼨 → l ) MIAO LETTER GHA → LATIN SMALL LETTER L # →I→ 1028A ; 006C ; MA # ( 𐊊 → l ) LYCIAN LETTER J → LATIN SMALL LETTER L # →I→ 10309 ; 006C ; MA # ( 𐌉 → l ) OLD ITALIC LETTER I → LATIN SMALL LETTER L # →I→ -1CCDE ; 006C ; MA #* ( 𜳞 → l ) OUTLINED LATIN CAPITAL LETTER I → LATIN SMALL LETTER L # →I→ -1CCF1 ; 006C ; MA # ( 𜳱 → l ) OUTLINED DIGIT ONE → LATIN SMALL LETTER L # →1→ 1D22A ; 004C ; MA #* ( 𝈪 → L ) GREEK INSTRUMENTAL NOTATION SYMBOL-23 → LATIN CAPITAL LETTER L # 216C ; 004C ; MA # ( Ⅼ → L ) ROMAN NUMERAL FIFTY → LATIN CAPITAL LETTER L # 2112 ; 004C ; MA # ( ℒ → L ) SCRIPT CAPITAL L → LATIN CAPITAL LETTER L # +1CCE1 ; 004C ; MA #* ( 𜳡 → L ) OUTLINED LATIN CAPITAL LETTER L → LATIN CAPITAL LETTER L # 1D40B ; 004C ; MA # ( 𝐋 → L ) MATHEMATICAL BOLD CAPITAL L → LATIN CAPITAL LETTER L # 1D43F ; 004C ; MA # ( 𝐿 → L ) MATHEMATICAL ITALIC CAPITAL L → LATIN CAPITAL LETTER L # 1D473 ; 004C ; MA # ( 𝑳 → L ) MATHEMATICAL BOLD ITALIC CAPITAL L → LATIN CAPITAL LETTER L # @@ -2651,7 +2652,6 @@ A4E1 ; 004C ; MA # ( ꓡ → L ) LISU LETTER LA → LATIN CAPITAL LETTER L # 118B2 ; 004C ; MA # ( 𑢲 → L ) WARANG CITI CAPITAL LETTER TTE → LATIN CAPITAL LETTER L # 1041B ; 004C ; MA # ( 𐐛 → L ) DESERET CAPITAL LETTER ETH → LATIN CAPITAL LETTER L # 10526 ; 004C ; MA # ( 𐔦 → L ) ELBASAN LETTER GHAMMA → LATIN CAPITAL LETTER L # -1CCE1 ; 004C ; MA #* ( 𜳡 → L ) OUTLINED LATIN CAPITAL LETTER L → 
LATIN CAPITAL LETTER L # FD3C ; 006C 030B ; MA # ( ‎ﴼ‎ → l̋ ) ARABIC LIGATURE ALEF WITH FATHATAN FINAL FORM → LATIN SMALL LETTER L, COMBINING DOUBLE ACUTE ACCENT # →‎اً‎→ FD3D ; 006C 030B ; MA # ( ‎ﴽ‎ → l̋ ) ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM → LATIN SMALL LETTER L, COMBINING DOUBLE ACUTE ACCENT # →‎اً‎→ @@ -2805,6 +2805,7 @@ ABAE ; 029F ; MA # ( ꮮ → ʟ ) CHEROKEE SMALL LETTER TLE → LATIN LETTER SMA FF2D ; 004D ; MA # ( M → M ) FULLWIDTH LATIN CAPITAL LETTER M → LATIN CAPITAL LETTER M # →Μ→ 216F ; 004D ; MA # ( Ⅿ → M ) ROMAN NUMERAL ONE THOUSAND → LATIN CAPITAL LETTER M # 2133 ; 004D ; MA # ( ℳ → M ) SCRIPT CAPITAL M → LATIN CAPITAL LETTER M # +1CCE2 ; 004D ; MA #* ( 𜳢 → M ) OUTLINED LATIN CAPITAL LETTER M → LATIN CAPITAL LETTER M # 1D40C ; 004D ; MA # ( 𝐌 → M ) MATHEMATICAL BOLD CAPITAL M → LATIN CAPITAL LETTER M # 1D440 ; 004D ; MA # ( 𝑀 → M ) MATHEMATICAL ITALIC CAPITAL M → LATIN CAPITAL LETTER M # 1D474 ; 004D ; MA # ( 𝑴 → M ) MATHEMATICAL BOLD ITALIC CAPITAL M → LATIN CAPITAL LETTER M # @@ -2832,7 +2833,6 @@ FF2D ; 004D ; MA # ( M → M ) FULLWIDTH LATIN CAPITAL LETTER M → LATIN CAPI A4DF ; 004D ; MA # ( ꓟ → M ) LISU LETTER MA → LATIN CAPITAL LETTER M # 102B0 ; 004D ; MA # ( 𐊰 → M ) CARIAN LETTER S → LATIN CAPITAL LETTER M # 10311 ; 004D ; MA # ( 𐌑 → M ) OLD ITALIC LETTER SHE → LATIN CAPITAL LETTER M # -1CCE2 ; 004D ; MA #* ( 𜳢 → M ) OUTLINED LATIN CAPITAL LETTER M → LATIN CAPITAL LETTER M # 04CD ; 004D 0326 ; MA # ( Ӎ → M̦ ) CYRILLIC CAPITAL LETTER EM WITH TAIL → LATIN CAPITAL LETTER M, COMBINING COMMA BELOW # →М̡→ @@ -2858,6 +2858,7 @@ A4DF ; 004D ; MA # ( ꓟ → M ) LISU LETTER MA → LATIN CAPITAL LETTER M # FF2E ; 004E ; MA # ( N → N ) FULLWIDTH LATIN CAPITAL LETTER N → LATIN CAPITAL LETTER N # →Ν→ 2115 ; 004E ; MA # ( ℕ → N ) DOUBLE-STRUCK CAPITAL N → LATIN CAPITAL LETTER N # +1CCE3 ; 004E ; MA #* ( 𜳣 → N ) OUTLINED LATIN CAPITAL LETTER N → LATIN CAPITAL LETTER N # 1D40D ; 004E ; MA # ( 𝐍 → N ) MATHEMATICAL BOLD CAPITAL N → LATIN CAPITAL LETTER N # 
1D441 ; 004E ; MA # ( 𝑁 → N ) MATHEMATICAL ITALIC CAPITAL N → LATIN CAPITAL LETTER N # 1D475 ; 004E ; MA # ( 𝑵 → N ) MATHEMATICAL BOLD ITALIC CAPITAL N → LATIN CAPITAL LETTER N # @@ -2879,7 +2880,6 @@ FF2E ; 004E ; MA # ( N → N ) FULLWIDTH LATIN CAPITAL LETTER N → LATIN CAPI 2C9A ; 004E ; MA # ( Ⲛ → N ) COPTIC CAPITAL LETTER NI → LATIN CAPITAL LETTER N # A4E0 ; 004E ; MA # ( ꓠ → N ) LISU LETTER NA → LATIN CAPITAL LETTER N # 10513 ; 004E ; MA # ( 𐔓 → N ) ELBASAN LETTER NE → LATIN CAPITAL LETTER N # -1CCE3 ; 004E ; MA #* ( 𜳣 → N ) OUTLINED LATIN CAPITAL LETTER N → LATIN CAPITAL LETTER N # 1018E ; 004E 030A ; MA #* ( 𐆎 → N̊ ) NOMISMA SIGN → LATIN CAPITAL LETTER N, COMBINING RING ABOVE # →Νͦ→ @@ -2994,6 +2994,7 @@ FBA6 ; 006F ; MA # ( ‎ﮦ‎ → o ) ARABIC LETTER HEH GOAL ISOLATED FORM → 3007 ; 004F ; MA # ( 〇 → O ) IDEOGRAPHIC NUMBER ZERO → LATIN CAPITAL LETTER O # 114D0 ; 004F ; MA # ( 𑓐 → O ) TIRHUTA DIGIT ZERO → LATIN CAPITAL LETTER O # →০→→0→ 118E0 ; 004F ; MA # ( 𑣠 → O ) WARANG CITI DIGIT ZERO → LATIN CAPITAL LETTER O # →0→ +1CCF0 ; 004F ; MA # ( 𜳰 → O ) OUTLINED DIGIT ZERO → LATIN CAPITAL LETTER O # →0→ 1D7CE ; 004F ; MA # ( 𝟎 → O ) MATHEMATICAL BOLD DIGIT ZERO → LATIN CAPITAL LETTER O # →0→ 1D7D8 ; 004F ; MA # ( 𝟘 → O ) MATHEMATICAL DOUBLE-STRUCK DIGIT ZERO → LATIN CAPITAL LETTER O # →0→ 1D7E2 ; 004F ; MA # ( 𝟢 → O ) MATHEMATICAL SANS-SERIF DIGIT ZERO → LATIN CAPITAL LETTER O # →0→ @@ -3001,6 +3002,7 @@ FBA6 ; 006F ; MA # ( ‎ﮦ‎ → o ) ARABIC LETTER HEH GOAL ISOLATED FORM → 1D7F6 ; 004F ; MA # ( 𝟶 → O ) MATHEMATICAL MONOSPACE DIGIT ZERO → LATIN CAPITAL LETTER O # →0→ 1FBF0 ; 004F ; MA # ( 🯰 → O ) SEGMENTED DIGIT ZERO → LATIN CAPITAL LETTER O # →0→ FF2F ; 004F ; MA # ( O → O ) FULLWIDTH LATIN CAPITAL LETTER O → LATIN CAPITAL LETTER O # →О→ +1CCE4 ; 004F ; MA #* ( 𜳤 → O ) OUTLINED LATIN CAPITAL LETTER O → LATIN CAPITAL LETTER O # 1D40E ; 004F ; MA # ( 𝐎 → O ) MATHEMATICAL BOLD CAPITAL O → LATIN CAPITAL LETTER O # 1D442 ; 004F ; MA # ( 𝑂 → O ) MATHEMATICAL ITALIC CAPITAL 
O → LATIN CAPITAL LETTER O # 1D476 ; 004F ; MA # ( 𝑶 → O ) MATHEMATICAL BOLD ITALIC CAPITAL O → LATIN CAPITAL LETTER O # @@ -3033,8 +3035,6 @@ A4F3 ; 004F ; MA # ( ꓳ → O ) LISU LETTER O → LATIN CAPITAL LETTER O # 102AB ; 004F ; MA # ( 𐊫 → O ) CARIAN LETTER O → LATIN CAPITAL LETTER O # 10404 ; 004F ; MA # ( 𐐄 → O ) DESERET CAPITAL LETTER LONG O → LATIN CAPITAL LETTER O # 10516 ; 004F ; MA # ( 𐔖 → O ) ELBASAN LETTER O → LATIN CAPITAL LETTER O # -1CCE4 ; 004F ; MA #* ( 𜳤 → O ) OUTLINED LATIN CAPITAL LETTER O → LATIN CAPITAL LETTER O # -1CCF0 ; 004F ; MA # ( 𜳰 → O ) OUTLINED DIGIT ZERO → LATIN CAPITAL LETTER O # →0→ 2070 ; 00BA ; MA #* ( ⁰ → º ) SUPERSCRIPT ZERO → MASCULINE ORDINAL INDICATOR # 1D52 ; 00BA ; MA # ( ᵒ → º ) MODIFIER LETTER SMALL O → MASCULINE ORDINAL INDICATOR # →⁰→ @@ -3202,6 +3202,7 @@ FF50 ; 0070 ; MA # ( p → p ) FULLWIDTH LATIN SMALL LETTER P → LATIN SMALL FF30 ; 0050 ; MA # ( P → P ) FULLWIDTH LATIN CAPITAL LETTER P → LATIN CAPITAL LETTER P # →Р→ 2119 ; 0050 ; MA # ( ℙ → P ) DOUBLE-STRUCK CAPITAL P → LATIN CAPITAL LETTER P # +1CCE5 ; 0050 ; MA #* ( 𜳥 → P ) OUTLINED LATIN CAPITAL LETTER P → LATIN CAPITAL LETTER P # 1D40F ; 0050 ; MA # ( 𝐏 → P ) MATHEMATICAL BOLD CAPITAL P → LATIN CAPITAL LETTER P # 1D443 ; 0050 ; MA # ( 𝑃 → P ) MATHEMATICAL ITALIC CAPITAL P → LATIN CAPITAL LETTER P # 1D477 ; 0050 ; MA # ( 𝑷 → P ) MATHEMATICAL BOLD ITALIC CAPITAL P → LATIN CAPITAL LETTER P # @@ -3226,7 +3227,6 @@ FF30 ; 0050 ; MA # ( P → P ) FULLWIDTH LATIN CAPITAL LETTER P → LATIN CAPI 146D ; 0050 ; MA # ( ᑭ → P ) CANADIAN SYLLABICS KI → LATIN CAPITAL LETTER P # A4D1 ; 0050 ; MA # ( ꓑ → P ) LISU LETTER PA → LATIN CAPITAL LETTER P # 10295 ; 0050 ; MA # ( 𐊕 → P ) LYCIAN LETTER R → LATIN CAPITAL LETTER P # -1CCE5 ; 0050 ; MA #* ( 𜳥 → P ) OUTLINED LATIN CAPITAL LETTER P → LATIN CAPITAL LETTER P # 01A5 ; 0070 0314 ; MA # ( ƥ → p̔ ) LATIN SMALL LETTER P WITH HOOK → LATIN SMALL LETTER P, COMBINING REVERSED COMMA ABOVE # @@ -3272,6 +3272,7 @@ ABB2 ; 1D18 ; MA # ( ꮲ → ᴘ ) 
CHEROKEE SMALL LETTER TLV → LATIN LETTER SM 0566 ; 0071 ; MA # ( զ → q ) ARMENIAN SMALL LETTER ZA → LATIN SMALL LETTER Q # 211A ; 0051 ; MA # ( ℚ → Q ) DOUBLE-STRUCK CAPITAL Q → LATIN CAPITAL LETTER Q # +1CCE6 ; 0051 ; MA #* ( 𜳦 → Q ) OUTLINED LATIN CAPITAL LETTER Q → LATIN CAPITAL LETTER Q # 1D410 ; 0051 ; MA # ( 𝐐 → Q ) MATHEMATICAL BOLD CAPITAL Q → LATIN CAPITAL LETTER Q # 1D444 ; 0051 ; MA # ( 𝑄 → Q ) MATHEMATICAL ITALIC CAPITAL Q → LATIN CAPITAL LETTER Q # 1D478 ; 0051 ; MA # ( 𝑸 → Q ) MATHEMATICAL BOLD ITALIC CAPITAL Q → LATIN CAPITAL LETTER Q # @@ -3285,7 +3286,6 @@ ABB2 ; 1D18 ; MA # ( ꮲ → ᴘ ) CHEROKEE SMALL LETTER TLV → LATIN LETTER SM 1D64C ; 0051 ; MA # ( 𝙌 → Q ) MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL Q → LATIN CAPITAL LETTER Q # 1D680 ; 0051 ; MA # ( 𝚀 → Q ) MATHEMATICAL MONOSPACE CAPITAL Q → LATIN CAPITAL LETTER Q # 2D55 ; 0051 ; MA # ( ⵕ → Q ) TIFINAGH LETTER YARR → LATIN CAPITAL LETTER Q # -1CCE6 ; 0051 ; MA #* ( 𜳦 → Q ) OUTLINED LATIN CAPITAL LETTER Q → LATIN CAPITAL LETTER Q # 02A0 ; 0071 0314 ; MA # ( ʠ → q̔ ) LATIN SMALL LETTER Q WITH HOOK → LATIN SMALL LETTER Q, COMBINING REVERSED COMMA ABOVE # @@ -3338,6 +3338,7 @@ AB81 ; 0072 ; MA # ( ꮁ → r ) CHEROKEE SMALL LETTER HU → LATIN SMALL LETTER 211B ; 0052 ; MA # ( ℛ → R ) SCRIPT CAPITAL R → LATIN CAPITAL LETTER R # 211C ; 0052 ; MA # ( ℜ → R ) BLACK-LETTER CAPITAL R → LATIN CAPITAL LETTER R # 211D ; 0052 ; MA # ( ℝ → R ) DOUBLE-STRUCK CAPITAL R → LATIN CAPITAL LETTER R # +1CCE7 ; 0052 ; MA #* ( 𜳧 → R ) OUTLINED LATIN CAPITAL LETTER R → LATIN CAPITAL LETTER R # 1D411 ; 0052 ; MA # ( 𝐑 → R ) MATHEMATICAL BOLD CAPITAL R → LATIN CAPITAL LETTER R # 1D445 ; 0052 ; MA # ( 𝑅 → R ) MATHEMATICAL ITALIC CAPITAL R → LATIN CAPITAL LETTER R # 1D479 ; 0052 ; MA # ( 𝑹 → R ) MATHEMATICAL BOLD ITALIC CAPITAL R → LATIN CAPITAL LETTER R # @@ -3355,7 +3356,6 @@ AB81 ; 0072 ; MA # ( ꮁ → r ) CHEROKEE SMALL LETTER HU → LATIN SMALL LETTER 1587 ; 0052 ; MA # ( ᖇ → R ) CANADIAN SYLLABICS TLHI → LATIN CAPITAL LETTER R # 
A4E3 ; 0052 ; MA # ( ꓣ → R ) LISU LETTER ZHA → LATIN CAPITAL LETTER R # 16F35 ; 0052 ; MA # ( 𖼵 → R ) MIAO LETTER ZHA → LATIN CAPITAL LETTER R # -1CCE7 ; 0052 ; MA #* ( 𜳧 → R ) OUTLINED LATIN CAPITAL LETTER R → LATIN CAPITAL LETTER R # 027D ; 0072 0328 ; MA # ( ɽ → r̨ ) LATIN SMALL LETTER R WITH TAIL → LATIN SMALL LETTER R, COMBINING OGONEK # @@ -3425,6 +3425,7 @@ ABAA ; 0073 ; MA # ( ꮪ → s ) CHEROKEE SMALL LETTER DU → LATIN SMALL LETTER 10448 ; 0073 ; MA # ( 𐑈 → s ) DESERET SMALL LETTER ZHEE → LATIN SMALL LETTER S # FF33 ; 0053 ; MA # ( S → S ) FULLWIDTH LATIN CAPITAL LETTER S → LATIN CAPITAL LETTER S # →Ѕ→ +1CCE8 ; 0053 ; MA #* ( 𜳨 → S ) OUTLINED LATIN CAPITAL LETTER S → LATIN CAPITAL LETTER S # 1D412 ; 0053 ; MA # ( 𝐒 → S ) MATHEMATICAL BOLD CAPITAL S → LATIN CAPITAL LETTER S # 1D446 ; 0053 ; MA # ( 𝑆 → S ) MATHEMATICAL ITALIC CAPITAL S → LATIN CAPITAL LETTER S # 1D47A ; 0053 ; MA # ( 𝑺 → S ) MATHEMATICAL BOLD ITALIC CAPITAL S → LATIN CAPITAL LETTER S # @@ -3446,13 +3447,14 @@ A4E2 ; 0053 ; MA # ( ꓢ → S ) LISU LETTER SA → LATIN CAPITAL LETTER S # 16F3A ; 0053 ; MA # ( 𖼺 → S ) MIAO LETTER SA → LATIN CAPITAL LETTER S # 10296 ; 0053 ; MA # ( 𐊖 → S ) LYCIAN LETTER S → LATIN CAPITAL LETTER S # 10420 ; 0053 ; MA # ( 𐐠 → S ) DESERET CAPITAL LETTER ZHEE → LATIN CAPITAL LETTER S # -1CCE8 ; 0053 ; MA #* ( 𜳨 → S ) OUTLINED LATIN CAPITAL LETTER S → LATIN CAPITAL LETTER S # 0282 ; 0073 0328 ; MA # ( ʂ → s̨ ) LATIN SMALL LETTER S WITH HOOK → LATIN SMALL LETTER S, COMBINING OGONEK # 1D74 ; 0073 0334 ; MA # ( ᵴ → s̴ ) LATIN SMALL LETTER S WITH MIDDLE TILDE → LATIN SMALL LETTER S, COMBINING TILDE OVERLAY # A7B5 ; 00DF ; MA # ( ꞵ → ß ) LATIN SMALL LETTER BETA → LATIN SMALL LETTER SHARP S # →β→ +1E9E ; 00DF ; MA # ( ẞ → ß ) LATIN CAPITAL LETTER SHARP S → LATIN SMALL LETTER SHARP S # +A7D6 ; 00DF ; MA # ( Ꟗ → ß ) LATIN CAPITAL LETTER MIDDLE SCOTS S → LATIN SMALL LETTER SHARP S # →β→ 03B2 ; 00DF ; MA # ( β → ß ) GREEK SMALL LETTER BETA → LATIN SMALL LETTER SHARP S # 03D0 ; 00DF ; MA 
# ( ϐ → ß ) GREEK BETA SYMBOL → LATIN SMALL LETTER SHARP S # →β→ 1D6C3 ; 00DF ; MA # ( 𝛃 → ß ) MATHEMATICAL BOLD SMALL BETA → LATIN SMALL LETTER SHARP S # →β→ @@ -3503,6 +3505,7 @@ AB4D ; 0283 ; MA # ( ꭍ → ʃ ) LATIN SMALL LETTER BASELINE ESH → LATIN SMAL 27D9 ; 0054 ; MA #* ( ⟙ → T ) LARGE DOWN TACK → LATIN CAPITAL LETTER T # 1F768 ; 0054 ; MA #* ( 🝨 → T ) ALCHEMICAL SYMBOL FOR CRUCIBLE-4 → LATIN CAPITAL LETTER T # FF34 ; 0054 ; MA # ( T → T ) FULLWIDTH LATIN CAPITAL LETTER T → LATIN CAPITAL LETTER T # →Т→ +1CCE9 ; 0054 ; MA #* ( 𜳩 → T ) OUTLINED LATIN CAPITAL LETTER T → LATIN CAPITAL LETTER T # 1D413 ; 0054 ; MA # ( 𝐓 → T ) MATHEMATICAL BOLD CAPITAL T → LATIN CAPITAL LETTER T # 1D447 ; 0054 ; MA # ( 𝑇 → T ) MATHEMATICAL ITALIC CAPITAL T → LATIN CAPITAL LETTER T # 1D47B ; 0054 ; MA # ( 𝑻 → T ) MATHEMATICAL BOLD ITALIC CAPITAL T → LATIN CAPITAL LETTER T # @@ -3531,7 +3534,6 @@ A4D4 ; 0054 ; MA # ( ꓔ → T ) LISU LETTER TA → LATIN CAPITAL LETTER T # 10297 ; 0054 ; MA # ( 𐊗 → T ) LYCIAN LETTER T → LATIN CAPITAL LETTER T # 102B1 ; 0054 ; MA # ( 𐊱 → T ) CARIAN LETTER C-18 → LATIN CAPITAL LETTER T # 10315 ; 0054 ; MA # ( 𐌕 → T ) OLD ITALIC LETTER TE → LATIN CAPITAL LETTER T # -1CCE9 ; 0054 ; MA #* ( 𜳩 → T ) OUTLINED LATIN CAPITAL LETTER T → LATIN CAPITAL LETTER T # 01AD ; 0074 0314 ; MA # ( ƭ → t̔ ) LATIN SMALL LETTER T WITH HOOK → LATIN SMALL LETTER T, COMBINING REVERSED COMMA ABOVE # @@ -3614,6 +3616,7 @@ AB52 ; 0075 ; MA # ( ꭒ → u ) LATIN SMALL LETTER U WITH LEFT HOOK → LATIN S 222A ; 0055 ; MA #* ( ∪ → U ) UNION → LATIN CAPITAL LETTER U # →ᑌ→ 22C3 ; 0055 ; MA #* ( ⋃ → U ) N-ARY UNION → LATIN CAPITAL LETTER U # →∪→→ᑌ→ +1CCEA ; 0055 ; MA #* ( 𜳪 → U ) OUTLINED LATIN CAPITAL LETTER U → LATIN CAPITAL LETTER U # 1D414 ; 0055 ; MA # ( 𝐔 → U ) MATHEMATICAL BOLD CAPITAL U → LATIN CAPITAL LETTER U # 1D448 ; 0055 ; MA # ( 𝑈 → U ) MATHEMATICAL ITALIC CAPITAL U → LATIN CAPITAL LETTER U # 1D47C ; 0055 ; MA # ( 𝑼 → U ) MATHEMATICAL BOLD ITALIC CAPITAL U → LATIN CAPITAL LETTER U # @@ 
-3634,7 +3637,6 @@ AB52 ; 0075 ; MA # ( ꭒ → u ) LATIN SMALL LETTER U WITH LEFT HOOK → LATIN S A4F4 ; 0055 ; MA # ( ꓴ → U ) LISU LETTER U → LATIN CAPITAL LETTER U # 16F42 ; 0055 ; MA # ( 𖽂 → U ) MIAO LETTER WA → LATIN CAPITAL LETTER U # 118B8 ; 0055 ; MA # ( 𑢸 → U ) WARANG CITI CAPITAL LETTER PU → LATIN CAPITAL LETTER U # -1CCEA ; 0055 ; MA #* ( 𜳪 → U ) OUTLINED LATIN CAPITAL LETTER U → LATIN CAPITAL LETTER U # 01D4 ; 016D ; MA # ( ǔ → ŭ ) LATIN SMALL LETTER U WITH CARON → LATIN SMALL LETTER U WITH BREVE # @@ -3699,6 +3701,7 @@ ABA9 ; 0076 ; MA # ( ꮩ → v ) CHEROKEE SMALL LETTER DO → LATIN SMALL LETTER 0667 ; 0056 ; MA # ( ‎٧‎ → V ) ARABIC-INDIC DIGIT SEVEN → LATIN CAPITAL LETTER V # 06F7 ; 0056 ; MA # ( ۷ → V ) EXTENDED ARABIC-INDIC DIGIT SEVEN → LATIN CAPITAL LETTER V # →‎٧‎→ 2164 ; 0056 ; MA # ( Ⅴ → V ) ROMAN NUMERAL FIVE → LATIN CAPITAL LETTER V # +1CCEB ; 0056 ; MA #* ( 𜳫 → V ) OUTLINED LATIN CAPITAL LETTER V → LATIN CAPITAL LETTER V # 1D415 ; 0056 ; MA # ( 𝐕 → V ) MATHEMATICAL BOLD CAPITAL V → LATIN CAPITAL LETTER V # 1D449 ; 0056 ; MA # ( 𝑉 → V ) MATHEMATICAL ITALIC CAPITAL V → LATIN CAPITAL LETTER V # 1D47D ; 0056 ; MA # ( 𝑽 → V ) MATHEMATICAL BOLD ITALIC CAPITAL V → LATIN CAPITAL LETTER V # @@ -3721,7 +3724,6 @@ A4E6 ; 0056 ; MA # ( ꓦ → V ) LISU LETTER HA → LATIN CAPITAL LETTER V # 16F08 ; 0056 ; MA # ( 𖼈 → V ) MIAO LETTER VA → LATIN CAPITAL LETTER V # 118A0 ; 0056 ; MA # ( 𑢠 → V ) WARANG CITI CAPITAL LETTER NGAA → LATIN CAPITAL LETTER V # 1051D ; 0056 ; MA # ( 𐔝 → V ) ELBASAN LETTER TE → LATIN CAPITAL LETTER V # -1CCEB ; 0056 ; MA #* ( 𜳫 → V ) OUTLINED LATIN CAPITAL LETTER V → LATIN CAPITAL LETTER V # 10197 ; 0056 0335 ; MA #* ( 𐆗 → V̵ ) ROMAN QUINARIUS SIGN → LATIN CAPITAL LETTER V, COMBINING SHORT STROKE OVERLAY # →V̶→ @@ -3748,6 +3750,7 @@ A4E6 ; 0056 ; MA # ( ꓦ → V ) LISU LETTER HA → LATIN CAPITAL LETTER V # 0668 ; 0245 ; MA # ( ‎٨‎ → Ʌ ) ARABIC-INDIC DIGIT EIGHT → LATIN CAPITAL LETTER TURNED V # →Λ→ 06F8 ; 0245 ; MA # ( ۸ → Ʌ ) EXTENDED ARABIC-INDIC 
DIGIT EIGHT → LATIN CAPITAL LETTER TURNED V # →‎٨‎→→Λ→ +A7DA ; 0245 ; MA # ( Ꟛ → Ʌ ) LATIN CAPITAL LETTER LAMBDA → LATIN CAPITAL LETTER TURNED V # →Λ→ 039B ; 0245 ; MA # ( Λ → Ʌ ) GREEK CAPITAL LETTER LAMDA → LATIN CAPITAL LETTER TURNED V # 1D6B2 ; 0245 ; MA # ( 𝚲 → Ʌ ) MATHEMATICAL BOLD CAPITAL LAMDA → LATIN CAPITAL LETTER TURNED V # →Λ→ 1D6EC ; 0245 ; MA # ( 𝛬 → Ʌ ) MATHEMATICAL ITALIC CAPITAL LAMDA → LATIN CAPITAL LETTER TURNED V # →Λ→ @@ -3763,6 +3766,8 @@ A4E5 ; 0245 ; MA # ( ꓥ → Ʌ ) LISU LETTER NGA → LATIN CAPITAL LETTER TURNE 16F3D ; 0245 ; MA # ( 𖼽 → Ʌ ) MIAO LETTER ZZA → LATIN CAPITAL LETTER TURNED V # 1028D ; 0245 ; MA # ( 𐊍 → Ʌ ) LYCIAN LETTER L → LATIN CAPITAL LETTER TURNED V # →Λ→ +A7DC ; 0245 0338 ; MA # ( Ƛ → Ʌ̸ ) LATIN CAPITAL LETTER LAMBDA WITH STROKE → LATIN CAPITAL LETTER TURNED V, COMBINING LONG SOLIDUS OVERLAY # →Λ̷→ + 04C5 ; 0245 0326 ; MA # ( Ӆ → Ʌ̦ ) CYRILLIC CAPITAL LETTER EL WITH TAIL → LATIN CAPITAL LETTER TURNED V, COMBINING COMMA BELOW # →Л̡→ 143D ; 0245 00B7 ; MA # ( ᐽ → Ʌ· ) CANADIAN SYLLABICS WEST-CREE PWI → LATIN CAPITAL LETTER TURNED V, MIDDLE DOT # →ᐱᐧ→→ᐱ·→ @@ -3792,6 +3797,7 @@ AB83 ; 0077 ; MA # ( ꮃ → w ) CHEROKEE SMALL LETTER LA → LATIN SMALL LETTER 118EF ; 0057 ; MA #* ( 𑣯 → W ) WARANG CITI NUMBER SIXTY → LATIN CAPITAL LETTER W # 118E6 ; 0057 ; MA # ( 𑣦 → W ) WARANG CITI DIGIT SIX → LATIN CAPITAL LETTER W # +1CCEC ; 0057 ; MA #* ( 𜳬 → W ) OUTLINED LATIN CAPITAL LETTER W → LATIN CAPITAL LETTER W # 1D416 ; 0057 ; MA # ( 𝐖 → W ) MATHEMATICAL BOLD CAPITAL W → LATIN CAPITAL LETTER W # 1D44A ; 0057 ; MA # ( 𝑊 → W ) MATHEMATICAL ITALIC CAPITAL W → LATIN CAPITAL LETTER W # 1D47E ; 0057 ; MA # ( 𝑾 → W ) MATHEMATICAL BOLD ITALIC CAPITAL W → LATIN CAPITAL LETTER W # @@ -3809,7 +3815,6 @@ AB83 ; 0077 ; MA # ( ꮃ → w ) CHEROKEE SMALL LETTER LA → LATIN SMALL LETTER 13B3 ; 0057 ; MA # ( Ꮃ → W ) CHEROKEE LETTER LA → LATIN CAPITAL LETTER W # 13D4 ; 0057 ; MA # ( Ꮤ → W ) CHEROKEE LETTER TA → LATIN CAPITAL LETTER W # A4EA ; 0057 ; MA # ( ꓪ → W ) 
LISU LETTER WA → LATIN CAPITAL LETTER W # -1CCEC ; 0057 ; MA #* ( 𜳬 → W ) OUTLINED LATIN CAPITAL LETTER W → LATIN CAPITAL LETTER W # 047D ; 0077 0486 0487 ; MA # ( ѽ → w҆҇ ) CYRILLIC SMALL LETTER OMEGA WITH TITLO → LATIN SMALL LETTER W, COMBINING CYRILLIC PSILI PNEUMATA, COMBINING CYRILLIC POKRYTIE # →ѡ҆҇→ @@ -3857,6 +3862,7 @@ FF58 ; 0078 ; MA # ( x → x ) FULLWIDTH LATIN SMALL LETTER X → LATIN SMALL 118EC ; 0058 ; MA #* ( 𑣬 → X ) WARANG CITI NUMBER THIRTY → LATIN CAPITAL LETTER X # FF38 ; 0058 ; MA # ( X → X ) FULLWIDTH LATIN CAPITAL LETTER X → LATIN CAPITAL LETTER X # →Х→ 2169 ; 0058 ; MA # ( Ⅹ → X ) ROMAN NUMERAL TEN → LATIN CAPITAL LETTER X # +1CCED ; 0058 ; MA #* ( 𜳭 → X ) OUTLINED LATIN CAPITAL LETTER X → LATIN CAPITAL LETTER X # 1D417 ; 0058 ; MA # ( 𝐗 → X ) MATHEMATICAL BOLD CAPITAL X → LATIN CAPITAL LETTER X # 1D44B ; 0058 ; MA # ( 𝑋 → X ) MATHEMATICAL ITALIC CAPITAL X → LATIN CAPITAL LETTER X # 1D47F ; 0058 ; MA # ( 𝑿 → X ) MATHEMATICAL BOLD ITALIC CAPITAL X → LATIN CAPITAL LETTER X # @@ -3886,7 +3892,6 @@ A4EB ; 0058 ; MA # ( ꓫ → X ) LISU LETTER SHA → LATIN CAPITAL LETTER X # 102B4 ; 0058 ; MA # ( 𐊴 → X ) CARIAN LETTER X → LATIN CAPITAL LETTER X # 10317 ; 0058 ; MA # ( 𐌗 → X ) OLD ITALIC LETTER EKS → LATIN CAPITAL LETTER X # 10527 ; 0058 ; MA # ( 𐔧 → X ) ELBASAN LETTER KHE → LATIN CAPITAL LETTER X # -1CCED ; 0058 ; MA #* ( 𜳭 → X ) OUTLINED LATIN CAPITAL LETTER X → LATIN CAPITAL LETTER X # 2A30 ; 0078 0307 ; MA #* ( ⨰ → ẋ ) MULTIPLICATION SIGN WITH DOT ABOVE → LATIN SMALL LETTER X, COMBINING DOT ABOVE # →×̇→ @@ -3934,6 +3939,7 @@ AB5A ; 0079 ; MA # ( ꭚ → y ) LATIN SMALL LETTER Y WITH SHORT RIGHT LEG → L 118DC ; 0079 ; MA # ( 𑣜 → y ) WARANG CITI SMALL LETTER HAR → LATIN SMALL LETTER Y # →ɣ→→γ→ FF39 ; 0059 ; MA # ( Y → Y ) FULLWIDTH LATIN CAPITAL LETTER Y → LATIN CAPITAL LETTER Y # →Υ→ +1CCEE ; 0059 ; MA #* ( 𜳮 → Y ) OUTLINED LATIN CAPITAL LETTER Y → LATIN CAPITAL LETTER Y # 1D418 ; 0059 ; MA # ( 𝐘 → Y ) MATHEMATICAL BOLD CAPITAL Y → LATIN CAPITAL LETTER Y 
# 1D44C ; 0059 ; MA # ( 𝑌 → Y ) MATHEMATICAL ITALIC CAPITAL Y → LATIN CAPITAL LETTER Y # 1D480 ; 0059 ; MA # ( 𝒀 → Y ) MATHEMATICAL BOLD ITALIC CAPITAL Y → LATIN CAPITAL LETTER Y # @@ -3963,7 +3969,6 @@ A4EC ; 0059 ; MA # ( ꓬ → Y ) LISU LETTER YA → LATIN CAPITAL LETTER Y # 16F43 ; 0059 ; MA # ( 𖽃 → Y ) MIAO LETTER AH → LATIN CAPITAL LETTER Y # 118A4 ; 0059 ; MA # ( 𑢤 → Y ) WARANG CITI CAPITAL LETTER YA → LATIN CAPITAL LETTER Y # 102B2 ; 0059 ; MA # ( 𐊲 → Y ) CARIAN LETTER U → LATIN CAPITAL LETTER Y # -1CCEE ; 0059 ; MA #* ( 𜳮 → Y ) OUTLINED LATIN CAPITAL LETTER Y → LATIN CAPITAL LETTER Y # 01B4 ; 0079 0314 ; MA # ( ƴ → y̔ ) LATIN SMALL LETTER Y WITH HOOK → LATIN SMALL LETTER Y, COMBINING REVERSED COMMA ABOVE # @@ -4002,6 +4007,7 @@ AB93 ; 007A ; MA # ( ꮓ → z ) CHEROKEE SMALL LETTER NO → LATIN SMALL LETTER FF3A ; 005A ; MA # ( Z → Z ) FULLWIDTH LATIN CAPITAL LETTER Z → LATIN CAPITAL LETTER Z # →Ζ→ 2124 ; 005A ; MA # ( ℤ → Z ) DOUBLE-STRUCK CAPITAL Z → LATIN CAPITAL LETTER Z # 2128 ; 005A ; MA # ( ℨ → Z ) BLACK-LETTER CAPITAL Z → LATIN CAPITAL LETTER Z # +1CCEF ; 005A ; MA #* ( 𜳯 → Z ) OUTLINED LATIN CAPITAL LETTER Z → LATIN CAPITAL LETTER Z # 1D419 ; 005A ; MA # ( 𝐙 → Z ) MATHEMATICAL BOLD CAPITAL Z → LATIN CAPITAL LETTER Z # 1D44D ; 005A ; MA # ( 𝑍 → Z ) MATHEMATICAL ITALIC CAPITAL Z → LATIN CAPITAL LETTER Z # 1D481 ; 005A ; MA # ( 𝒁 → Z ) MATHEMATICAL BOLD ITALIC CAPITAL Z → LATIN CAPITAL LETTER Z # @@ -4022,7 +4028,6 @@ FF3A ; 005A ; MA # ( Z → Z ) FULLWIDTH LATIN CAPITAL LETTER Z → LATIN CAPI 13C3 ; 005A ; MA # ( Ꮓ → Z ) CHEROKEE LETTER NO → LATIN CAPITAL LETTER Z # A4DC ; 005A ; MA # ( ꓜ → Z ) LISU LETTER DZA → LATIN CAPITAL LETTER Z # 118A9 ; 005A ; MA # ( 𑢩 → Z ) WARANG CITI CAPITAL LETTER O → LATIN CAPITAL LETTER Z # -1CCEF ; 005A ; MA #* ( 𜳯 → Z ) OUTLINED LATIN CAPITAL LETTER Z → LATIN CAPITAL LETTER Z # 0290 ; 007A 0328 ; MA # ( ʐ → z̨ ) LATIN SMALL LETTER Z WITH RETROFLEX HOOK → LATIN SMALL LETTER Z, COMBINING OGONEK # →z̢→ @@ -4115,6 +4120,7 @@ A668 ; 
0298 ; MA # ( Ꙩ → ʘ ) CYRILLIC CAPITAL LETTER MONOCULAR O → LATIN 2CE4 ; 03D7 ; MA # ( ⳤ → ϗ ) COPTIC SYMBOL KAI → GREEK KAI SYMBOL # +A7DB ; 03BB ; MA # ( ꟛ → λ ) LATIN SMALL LETTER LAMBDA → GREEK SMALL LETTER LAMDA # 1D6CC ; 03BB ; MA # ( 𝛌 → λ ) MATHEMATICAL BOLD SMALL LAMDA → GREEK SMALL LETTER LAMDA # 1D706 ; 03BB ; MA # ( 𝜆 → λ ) MATHEMATICAL ITALIC SMALL LAMDA → GREEK SMALL LETTER LAMDA # 1D740 ; 03BB ; MA # ( 𝝀 → λ ) MATHEMATICAL BOLD ITALIC SMALL LAMDA → GREEK SMALL LETTER LAMDA # @@ -4123,6 +4129,8 @@ A668 ; 0298 ; MA # ( Ꙩ → ʘ ) CYRILLIC CAPITAL LETTER MONOCULAR O → LATIN 2C96 ; 03BB ; MA # ( Ⲗ → λ ) COPTIC CAPITAL LETTER LAULA → GREEK SMALL LETTER LAMDA # 104DB ; 03BB ; MA # ( 𐓛 → λ ) OSAGE SMALL LETTER AH → GREEK SMALL LETTER LAMDA # +019B ; 03BB 0338 ; MA # ( ƛ → λ̸ ) LATIN SMALL LETTER LAMBDA WITH STROKE → GREEK SMALL LETTER LAMDA, COMBINING LONG SOLIDUS OVERLAY # →λ̷→ + 00B5 ; 03BC ; MA # ( µ → μ ) MICRO SIGN → GREEK SMALL LETTER MU # 1D6CD ; 03BC ; MA # ( 𝛍 → μ ) MATHEMATICAL BOLD SMALL MU → GREEK SMALL LETTER MU # 1D707 ; 03BC ; MA # ( 𝜇 → μ ) MATHEMATICAL ITALIC SMALL MU → GREEK SMALL LETTER MU # @@ -5982,6 +5990,8 @@ FE19 ; 2D57 ; MA #* ( ︙ → ⵗ ) PRESENTATION FORM FOR VERTICAL HORIZONTAL EL 11CB2 ; 11CAA ; MA # ( 𑲲 → 𑲪 ) MARCHEN VOWEL SIGN U → MARCHEN SUBJOINED LETTER RA # +1734 ; 1715 ; MA # ( ᜴ → ᜕ ) HANUNOO SIGN PAMUDPOD → TAGALOG SIGN PAMUDPOD # + 1081 ; 1002 103E ; MA # ( ႁ → ဂှ ) MYANMAR LETTER SHAN HA → MYANMAR LETTER GA, MYANMAR CONSONANT SIGN MEDIAL HA # 1000 ; 1002 102C ; MA # ( က → ဂာ ) MYANMAR LETTER KA → MYANMAR LETTER GA, MYANMAR VOWEL SIGN AA # @@ -7221,6 +7231,7 @@ FA31 ; 50E7 ; MA # ( 僧 → 僧 ) CJK COMPATIBILITY IDEOGRAPH-FA31 → CJK UNIF 2F80C ; 349E ; MA # ( 㒞 → 㒞 ) CJK COMPATIBILITY IDEOGRAPH-2F80C → CJK UNIFIED IDEOGRAPH-349E # +3126 ; 513F ; MA # ( ㄦ → 儿 ) BOPOMOFO LETTER ER → CJK UNIFIED IDEOGRAPH-513F # 2F09 ; 513F ; MA #* ( ⼉ → 儿 ) KANGXI RADICAL LEGS → CJK UNIFIED IDEOGRAPH-513F # FA0C ; 5140 ; MA # ( 兀 → 兀 ) CJK 
COMPATIBILITY IDEOGRAPH-FA0C → CJK UNIFIED IDEOGRAPH-5140 # @@ -7936,16 +7947,16 @@ FA8D ; 63C4 ; MA # ( 揄 → 揄 ) CJK COMPATIBILITY IDEOGRAPH-FA8D → CJK UNIF 2F8BD ; 63E4 ; MA # ( 揤 → 揤 ) CJK COMPATIBILITY IDEOGRAPH-2F8BD → CJK UNIFIED IDEOGRAPH-63E4 # -FA8F ; 6452 ; MA # ( 摒 → 摒 ) CJK COMPATIBILITY IDEOGRAPH-FA8F → CJK UNIFIED IDEOGRAPH-6452 # +FA8E ; 641C ; MA # ( 搜 → 搜 ) CJK COMPATIBILITY IDEOGRAPH-FA8E → CJK UNIFIED IDEOGRAPH-641C # 2F8BE ; 22BF1 ; MA # ( 𢯱 → 𢯱 ) CJK COMPATIBILITY IDEOGRAPH-2F8BE → CJK UNIFIED IDEOGRAPH-22BF1 # -FA8E ; 641C ; MA # ( 搜 → 搜 ) CJK COMPATIBILITY IDEOGRAPH-FA8E → CJK UNIFIED IDEOGRAPH-641C # - 2F8BF ; 6422 ; MA # ( 搢 → 搢 ) CJK COMPATIBILITY IDEOGRAPH-2F8BF → CJK UNIFIED IDEOGRAPH-6422 # 2F8C0 ; 63C5 ; MA # ( 揅 → 揅 ) CJK COMPATIBILITY IDEOGRAPH-2F8C0 → CJK UNIFIED IDEOGRAPH-63C5 # +FA8F ; 6452 ; MA # ( 摒 → 摒 ) CJK COMPATIBILITY IDEOGRAPH-FA8F → CJK UNIFIED IDEOGRAPH-6452 # + 2F8C3 ; 6469 ; MA # ( 摩 → 摩 ) CJK COMPATIBILITY IDEOGRAPH-2F8C3 → CJK UNIFIED IDEOGRAPH-6469 # 2F8C6 ; 6477 ; MA # ( 摷 → 摷 ) CJK COMPATIBILITY IDEOGRAPH-2F8C6 → CJK UNIFIED IDEOGRAPH-6477 # @@ -9670,5 +9681,5 @@ FACE ; 9F9C ; MA # ( 龜 → 龜 ) CJK COMPATIBILITY IDEOGRAPH-FACE → CJK UNIF 2FD5 ; 9FA0 ; MA #* ( ⿕ → 龠 ) KANGXI RADICAL FLUTE → CJK UNIFIED IDEOGRAPH-9FA0 # -# total: 6347 +# total: 6355 diff --git a/unicodetools/data/security/dev/confusablesSummary.txt b/unicodetools/data/security/dev/confusablesSummary.txt index a35691149..093100c4b 100644 --- a/unicodetools/data/security/dev/confusablesSummary.txt +++ b/unicodetools/data/security/dev/confusablesSummary.txt @@ -1,5 +1,5 @@ # confusablesSummary.txt -# Date: 2024-05-03, 03:26:41 GMT +# Date: 2024-05-31, 21:12:55 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -91,7 +91,7 @@ ← (‎ ʽ ‎) 02BD MODIFIER LETTER REVERSED COMMA # →‘→ ← (‎ ʾ ‎) 02BE MODIFIER LETTER RIGHT HALF RING # →ʼ→→′→ ← (‎ ˈ ‎) 02C8 MODIFIER LETTER VERTICAL LINE -← (‎ ˊ ‎) 02CA MODIFIER LETTER ACUTE ACCENT # →ʹ→→′→ +← (‎ ˊ ‎) 02CA MODIFIER LETTER ACUTE ACCENT # →΄→→ʹ→ ← (‎ ˋ ‎) 02CB MODIFIER LETTER GRAVE ACCENT # →`→→‘→ ← (‎ ߴ ‎) 07F4 NKO HIGH TONE APOSTROPHE # →’→ ← (‎ ߵ ‎) 07F5 NKO LOW TONE APOSTROPHE # →‘→ @@ -4925,8 +4925,10 @@ ← (‎ Ϸ ‎) 03F7 GREEK CAPITAL LETTER SHO ← (‎ 𐓄 ‎) 104C4 OSAGE CAPITAL LETTER PA -# ß β Ᏸ ꞵ ϐ 𝛃 𝛽 𝜷 𝝱 𝞫 +# ß Ꟗ ẞ β Ᏸ ꞵ ϐ 𝛃 𝛽 𝜷 𝝱 𝞫 (‎ ß ‎) 00DF LATIN SMALL LETTER SHARP S +← (‎ Ꟗ ‎) A7D6 LATIN CAPITAL LETTER MIDDLE SCOTS S # →β→ +← (‎ ẞ ‎) 1E9E LATIN CAPITAL LETTER SHARP S ← (‎ β ‎) 03B2 GREEK SMALL LETTER BETA ← (‎ Ᏸ ‎) 13F0 CHEROKEE LETTER YE # →β→ ← (‎ ꞵ ‎) A7B5 LATIN SMALL LETTER BETA # →β→ @@ -5095,6 +5097,11 @@ ← (‎ 𝈡 ‎) 1D221 GREEK INSTRUMENTAL NOTATION SYMBOL-7 ← (‎ ℇ ‎) 2107 EULER CONSTANT +# λ̸ λ̷ ƛ + (‎ ƛ ‎) 019B LATIN SMALL LETTER LAMBDA WITH STROKE +← (‎ λ̸ ‎) 03BB 0338 GREEK SMALL LETTER LAMDA, COMBINING LONG SOLIDUS OVERLAY # →λ̷→ +← (‎ λ̷ ‎) 03BB 0337 GREEK SMALL LETTER LAMDA, COMBINING SHORT SOLIDUS OVERLAY + # ƨ ᴤ ϩ ꙅ (‎ ƨ ‎) 01A8 LATIN SMALL LETTER TONE TWO ← (‎ ᴤ ‎) 1D24 LATIN LETTER VOICED LARYNGEAL SPIRANT @@ -5165,8 +5172,9 @@ (‎ ɂ ‎) 0242 LATIN SMALL LETTER GLOTTAL STOP ← (‎ ꭾ ‎) AB7E CHEROKEE SMALL LETTER HE -# Ʌ ٨ ۸ Λ Л ᐱ ⴷ ꓥ ꛎ 𐊍 𖼽 𐒰 𝚲 𝛬 𝜦 𝝠 𝞚 +# Ʌ Ꟛ ٨ ۸ Λ Л ᐱ ⴷ ꓥ ꛎ 𐊍 𖼽 𐒰 𝚲 𝛬 𝜦 𝝠 𝞚 (‎ Ʌ ‎) 0245 LATIN CAPITAL LETTER TURNED V +← (‎ Ꟛ ‎) A7DA LATIN CAPITAL LETTER LAMBDA # →Λ→ ← (‎ ٨ ‎) 0668 ARABIC-INDIC DIGIT EIGHT # →Λ→ ← (‎ ۸ ‎) 06F8 EXTENDED ARABIC-INDIC DIGIT EIGHT # →‎٨‎→→Λ→ ← (‎ Λ ‎) 039B GREEK CAPITAL LETTER LAMDA @@ -5197,6 +5205,13 @@ ← (‎ Л̡ ‎) 041B 0321 CYRILLIC CAPITAL LETTER EL, COMBINING PALATALIZED HOOK BELOW ← (‎ Ӆ ‎) 04C5 CYRILLIC CAPITAL LETTER EL WITH TAIL # →Л̡→ +# Ꟛ̸ Ʌ̸ Λ̸ Λ̷ Ƛ + (‎ Ʌ̸ ‎) 0245 0338 LATIN 
CAPITAL LETTER TURNED V, COMBINING LONG SOLIDUS OVERLAY +← (‎ Ꟛ̸ ‎) A7DA 0338 LATIN CAPITAL LETTER LAMBDA, COMBINING LONG SOLIDUS OVERLAY # →Λ̷→ +← (‎ Λ̸ ‎) 039B 0338 GREEK CAPITAL LETTER LAMDA, COMBINING LONG SOLIDUS OVERLAY # →Λ̷→ +← (‎ Λ̷ ‎) 039B 0337 GREEK CAPITAL LETTER LAMDA, COMBINING SHORT SOLIDUS OVERLAY +← (‎ Ƛ ‎) A7DC LATIN CAPITAL LETTER LAMBDA WITH STROKE # →Λ̷→ + # ɋ ᶐ (‎ ɋ ‎) 024B LATIN SMALL LETTER Q WITH HOOK TAIL ← (‎ ᶐ ‎) 1D90 LATIN SMALL LETTER ALPHA WITH RETROFLEX HOOK @@ -5868,8 +5883,9 @@ ← (‎ 𝝵 ‎) 1D775 MATHEMATICAL SANS-SERIF BOLD SMALL ZETA ← (‎ 𝞯 ‎) 1D7AF MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ZETA -# λ Ⲗ 𐓛 𝛌 𝜆 𝝀 𝝺 𝞴 +# λ ꟛ Ⲗ 𐓛 𝛌 𝜆 𝝀 𝝺 𝞴 (‎ λ ‎) 03BB GREEK SMALL LETTER LAMDA +← (‎ ꟛ ‎) A7DB LATIN SMALL LETTER LAMBDA ← (‎ Ⲗ ‎) 2C96 COPTIC CAPITAL LETTER LAULA ← (‎ 𐓛 ‎) 104DB OSAGE SMALL LETTER AH ← (‎ 𝛌 ‎) 1D6CC MATHEMATICAL BOLD SMALL LAMDA @@ -11373,6 +11389,10 @@ (‎ ᛯ ‎) 16EF RUNIC TVIMADUR SYMBOL ← (‎ ⵣ ‎) 2D63 TIFINAGH LETTER YAZ +# ᜕ ᜴ + (‎ ᜕ ‎) 1715 TAGALOG SIGN PAMUDPOD +← (‎ ᜴ ‎) 1734 HANUNOO SIGN PAMUDPOD + # អ ឣ (‎ អ ‎) 17A2 KHMER LETTER QA ← (‎ ឣ ‎) 17A3 KHMER INDEPENDENT VOWEL QAQ @@ -12322,9 +12342,10 @@ (‎ ⼈ ‎) 2F08 KANGXI RADICAL MAN ← (‎ 人 ‎) 4EBA CJK UNIFIED IDEOGRAPH-4EBA -# 儿 ⼉ +# 儿 ㄦ ⼉ (‎ ⼉ ‎) 2F09 KANGXI RADICAL LEGS ← (‎ 儿 ‎) 513F CJK UNIFIED IDEOGRAPH-513F +← (‎ ㄦ ‎) 3126 BOPOMOFO LETTER ER # →儿→ # 入 ⼊ (‎ ⼊ ‎) 2F0A KANGXI RADICAL ENTER @@ -17245,5 +17266,5 @@ (‎ 𪘀 ‎) 2A600 CJK UNIFIED IDEOGRAPH-2A600 ← (‎ 𪘀 ‎) 2FA1D CJK COMPATIBILITY IDEOGRAPH-2FA1D -# total : 7290 +# total : 7302 diff --git a/unicodetools/data/security/dev/data/confusablesSummaryIdentifier.txt b/unicodetools/data/security/dev/data/confusablesSummaryIdentifier.txt index 5435b8bff..6bd3611df 100644 --- a/unicodetools/data/security/dev/data/confusablesSummaryIdentifier.txt +++ b/unicodetools/data/security/dev/data/confusablesSummaryIdentifier.txt @@ -1,5 +1,5 @@ # confusablesSummaryIdentifier.txt -# Date: 2024-05-04, 21:31:06 GMT +# Date: 
2024-05-31, 21:12:55 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -542,8 +542,10 @@ (‎ Ö ‎) 00D6 LATIN CAPITAL LETTER O WITH DIAERESIS ← (‎ Ő ‎) 0150 LATIN CAPITAL LETTER O WITH DOUBLE ACUTE -# ß β +# ß Ꟗ ẞ β (‎ ß ‎) 00DF LATIN SMALL LETTER SHARP S +← (‎ Ꟗ ‎) A7D6 LATIN CAPITAL LETTER MIDDLE SCOTS S # →β→ +← (‎ ẞ ‎) 1E9E LATIN CAPITAL LETTER SHARP S ← (‎ β ‎) 03B2 GREEK SMALL LETTER BETA # å ȧ @@ -1618,6 +1620,10 @@ (‎ 二 ‎) 4E8C CJK UNIFIED IDEOGRAPH-4E8C ← (‎ ニ ‎) 30CB KATAKANA LETTER NI +# 儿 ㄦ + (‎ 儿 ‎) 513F CJK UNIFIED IDEOGRAPH-513F +← (‎ ㄦ ‎) 3126 BOPOMOFO LETTER ER + # 八 ハ (‎ 八 ‎) 516B CJK UNIFIED IDEOGRAPH-516B ← (‎ ハ ‎) 30CF KATAKANA LETTER HA @@ -1839,5 +1845,5 @@ (‎ 鹂 ‎) 9E42 CJK UNIFIED IDEOGRAPH-9E42 ← (‎ 鹃 ‎) 9E43 CJK UNIFIED IDEOGRAPH-9E43 -# total : 635 +# total : 638 diff --git a/unicodetools/data/security/dev/data/source/confusables-source.txt b/unicodetools/data/security/dev/data/source/confusables-source.txt index 65804747e..e4c47a779 100644 --- a/unicodetools/data/security/dev/data/source/confusables-source.txt +++ b/unicodetools/data/security/dev/data/source/confusables-source.txt @@ -1,4 +1,17 @@ -0021 ; 01C3 # ( ! → ǃ) EXCLAMATION MARK → LATIN LETTER RETROFLEX CLICK +# See https://github.com/unicode-org/unicodetools/blob/main/docs/security.md for how to use this file. +# The format is +# Source ; Target ; comments # comments +# Source is: +# - a hex code point +# - a literal character +# - a range of the above with .. (need to check this) +# - a UnicodeSet +# Target is: +# - a hex code point +# - a literal character +# - a sequence of hex code points and or literal characters (they can be mixed) +####### +0021 ; 01C3 # ( ! 
→ ǃ) EXCLAMATION MARK → LATIN LETTER RETROFLEX CLICK 0022 ; 02BA # ( " → ʺ) QUOTATION MARK → MODIFIER LETTER DOUBLE PRIME 0022 ; 0027 0027 0022 ; 05F4 # ( " → ״) QUOTATION MARK → HEBREW PUNCTUATION GERSHAYIM @@ -5437,4 +5450,26 @@ ABBB; 0473; V8_0; ꮻ => ѳ; CHEROKEE SMALL LETTER WI => CYRILLIC SMALL LETTER F 1F16E ; C 20E0 ; V11_0 ; CIRCLED C WITH OVERLAID BACKSLASH # 1F16F ; 🚹 ; V11_0 ; CIRCLED HUMAN FIGURE +# 178-A76 — Section 21 of document L2/24-012 +513F ; 儿 # V16.0 ; U+513F ︎➡︎ U+16FF2 +16FF3 ; 兒 # V16.0 ; U+5152 ➡ U+16FF3 +ㄦ ; 儿 # V16.0 ; U+3126 ㄦ BOPOMOFO LETTER ER ➡ 儿 + +# 176-A116 — Section 2a of L2/23-164 + +A7DA ; Λ # V16.0 ; U+A7DA LATIN CAPITAL LETTER LAMBDA ➡ greek equiv +A7DB ; λ # V16.0 ; U+A7DB LATIN SMALL LETTER LAMBDA ➡ greek equiv +A7DC ; Λ 0337 # V16.0 ; U+A7DC LATIN CAPITAL LETTER LAMBDA WITH STROKE ➡ greek equiv +ƛ ; λ 0337 # V16.0 ; existing Latin variant + +# 165-A37 — L2/20-272 + +1715 ; 1734 # V16.0 ; U+1715 TAGALOG SIGN PAMUDPOD ➡ 1734, Hanunoo Sign Pamudpod + +# 166-A55 — Section 3n of L2/21-016R + +ß ; β # sharp S with beta +ẞ ; ß # sharp S upper/lower +A7D6 ; β # Middle Scots S, uppercase +A7D6 ; β # Middle Scots S, lowercase diff --git a/unicodetools/data/security/dev/data/source/formatted-source.txt b/unicodetools/data/security/dev/data/source/formatted-source.txt index b7475ec78..216c1689e 100644 --- a/unicodetools/data/security/dev/data/source/formatted-source.txt +++ b/unicodetools/data/security/dev/data/source/formatted-source.txt @@ -1,5 +1,5 @@ # formatted-source.txt -# Date: 2024-05-03, 03:26:38 GMT +# Date: 2024-05-31, 21:12:51 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -899,6 +899,9 @@ 00DE ; 104C4 # ( Þ ~ 𐓄 ) LATIN CAPITAL LETTER THORN ~ OSAGE CAPITAL LETTER PA +00DF ; 1E9E # ( ß ~ ẞ ) LATIN SMALL LETTER SHARP S ~ LATIN CAPITAL LETTER SHARP S +00DF ; 03B2 # ( ß ~ β ) LATIN SMALL LETTER SHARP S ~ GREEK SMALL LETTER BETA + 00E5 ; 0227 # ( å ~ ȧ ) LATIN SMALL LETTER A WITH RING ABOVE ~ LATIN SMALL LETTER A WITH DOT ABOVE 00F0 ; 1E8CD # ( ð ~ ‎𞣍‎ ) LATIN SMALL LETTER ETH ~ MENDE KIKAKUI DIGIT SEVEN @@ -1295,6 +1298,8 @@ 039B ; A6CE # ( Λ ~ ꛎ ) GREEK CAPITAL LETTER LAMDA ~ BAMUM LETTER MI 039B ; 1028D # ( Λ ~ 𐊍 ) GREEK CAPITAL LETTER LAMDA ~ LYCIAN LETTER L +039B 0337 ; A7DC # ( Λ̷ ~ Ƛ ) GREEK CAPITAL LETTER LAMDA, COMBINING SHORT SOLIDUS OVERLAY ~ LATIN CAPITAL LETTER LAMBDA WITH STROKE + 039C ; 041C # ( Μ ~ М ) GREEK CAPITAL LETTER MU ~ CYRILLIC CAPITAL LETTER EM 039C ; 216F # ( Μ ~ Ⅿ ) GREEK CAPITAL LETTER MU ~ ROMAN NUMERAL ONE THOUSAND @@ -1354,8 +1359,11 @@ 03BA ; 043A # ( κ ~ к ) GREEK SMALL LETTER KAPPA ~ CYRILLIC SMALL LETTER KA +03BB ; A7DB # ( λ ~ ꟛ ) GREEK SMALL LETTER LAMDA ~ LATIN SMALL LETTER LAMBDA 03BB ; 104DB # ( λ ~ 𐓛 ) GREEK SMALL LETTER LAMDA ~ OSAGE SMALL LETTER AH +03BB 0337 ; 019B # ( λ̷ ~ ƛ ) GREEK SMALL LETTER LAMDA, COMBINING SHORT SOLIDUS OVERLAY ~ LATIN SMALL LETTER LAMBDA WITH STROKE + 03BD ; 2174 # ( ν ~ ⅴ ) GREEK SMALL LETTER NU ~ SMALL ROMAN NUMERAL FIVE 03BF ; 043E # ( ο ~ о ) GREEK SMALL LETTER OMICRON ~ CYRILLIC SMALL LETTER O @@ -3599,6 +3607,8 @@ 16EF ; 2D63 # ( ᛯ ~ ⵣ ) RUNIC TVIMADUR SYMBOL ~ TIFINAGH LETTER YAZ +1715 ; 1734 # ( ᜕ ~ ᜴ ) TAGALOG SIGN PAMUDPOD ~ HANUNOO SIGN PAMUDPOD + 17A2 ; 17A3 # ( អ ~ ឣ ) KHMER LETTER QA ~ KHMER INDEPENDENT VOWEL QAQ 185C ; 1896 # ( ᡜ ~ ᢖ ) MONGOLIAN LETTER TODO DZA ~ MONGOLIAN LETTER ALI GALI ZA @@ -4030,6 +4040,8 @@ 5024 ; 503C # ( 値 ~ 值 ) CJK UNIFIED IDEOGRAPH-5024 ~ CJK UNIFIED IDEOGRAPH-503C +513F ; 3126 # ( 儿 ~ ㄦ ) CJK UNIFIED IDEOGRAPH-513F ~ BOPOMOFO LETTER ER + 5553 ; 
555F # ( 啓 ~ 啟 ) CJK UNIFIED IDEOGRAPH-5553 ~ CJK UNIFIED IDEOGRAPH-555F 5861 ; 586B # ( 塡 ~ 填 ) CJK UNIFIED IDEOGRAPH-5861 ~ CJK UNIFIED IDEOGRAPH-586B @@ -4158,6 +4170,10 @@ A792 ; 0404 # ( Ꞓ ~ Є ) LATIN CAPITAL LETTER C WITH BAR ~ CYRILLIC CAPITAL LE A793 ; 0454 # ( ꞓ ~ є ) LATIN SMALL LETTER C WITH BAR ~ CYRILLIC SMALL LETTER UKRAINIAN IE +A7D6 ; 03B2 # ( Ꟗ ~ β ) LATIN CAPITAL LETTER MIDDLE SCOTS S ~ GREEK SMALL LETTER BETA + +A7DA ; 039B # ( Ꟛ ~ Λ ) LATIN CAPITAL LETTER LAMBDA ~ GREEK CAPITAL LETTER LAMDA + A7FB ; 15B7 # ( ꟻ ~ ᖷ ) LATIN EPIGRAPHIC LETTER REVERSED F ~ CANADIAN SYLLABICS BLACKFOOT WA A7FB ; 1D230 # ( ꟻ ~ 𝈰 ) LATIN EPIGRAPHIC LETTER REVERSED F ~ GREEK INSTRUMENTAL NOTATION SYMBOL-30 From 1cbe050eea371cb45bd2c54ea6df5c9b163fbc5c Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 5 Jun 2024 05:15:23 +0200 Subject: [PATCH 06/10] HST invariants (#850) Fix #848 --- .../org/unicode/text/UCD/UnicodeInvariantTest.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 3eb759238..3b3f3c35a 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -863,6 +863,20 @@ Let $TwoVietnameseReadingMarks = [\p{U15.1.0:ccc=6}] # an LV or V, respectively. [\p{NFC_QC=Maybe}&\p{ccc=0}] ⊆ [\p{GCB=Extend}\p{GCB=T}\p{GCB=V}] +# ICU relies on this to avoid carrying data for HST which would be mostly +# redundant with GCB. If this breaks, it should be noted on the landing page, +# and ICU-TC should be notified. +# See https://github.com/unicode-org/icu/pull/3026. +\p{HST=V} = [\p{GCB=V} & [\u0000-\uFFFF]] +# A more principled (if less practically useful) statement is that the +# dual-conjoining Hangul characters are exactly the Hangul vowels. 
+\p{HST=V} = [\p{GCB=V} & \p{Script=Hangul}] +# The other types are still straightforwardly related to their GCB counterparts. +\p{HST=L} = \p{GCB=L} +\p{HST=LV} = \p{GCB=LV} +\p{HST=LVT} = \p{GCB=LVT} +\p{HST=T} = \p{GCB=T} + ########################## # Emoji ########################## From fadec8f779b3b7a569c4eed827bad00f76a9da2f Mon Sep 17 00:00:00 2001 From: Ned Holbrook Date: Wed, 5 Jun 2024 10:17:31 -0700 Subject: [PATCH 07/10] Move emojis from food-marine to animal-marine (#753), new CLDR localizations for Mx claus (#844) --- unicodetools/data/emoji/dev/emoji-test.txt | 34 +++++++++---------- .../data/emoji/dev/emoji-zwj-sequences.txt | 14 ++++---- .../emoji/dev/internal/emoji-proposals.txt | 2 +- .../org/unicode/tools/emoji/emojiOrdering.txt | 4 +-- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/unicodetools/data/emoji/dev/emoji-test.txt b/unicodetools/data/emoji/dev/emoji-test.txt index e81fe0b19..95c5d5431 100644 --- a/unicodetools/data/emoji/dev/emoji-test.txt +++ b/unicodetools/data/emoji/dev/emoji-test.txt @@ -1,5 +1,5 @@ # emoji-test.txt -# Date: 2024-05-01, 21:25:24 GMT +# Date: 2024-06-04, 16:46:01 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -1751,12 +1751,12 @@ 1F936 1F3FD ; fully-qualified # 🤶🏽 E3.0 Mrs. Claus: medium skin tone 1F936 1F3FE ; fully-qualified # 🤶🏾 E3.0 Mrs. Claus: medium-dark skin tone 1F936 1F3FF ; fully-qualified # 🤶🏿 E3.0 Mrs. 
Claus: dark skin tone -1F9D1 200D 1F384 ; fully-qualified # 🧑‍🎄 E13.0 mx claus -1F9D1 1F3FB 200D 1F384 ; fully-qualified # 🧑🏻‍🎄 E13.0 mx claus: light skin tone -1F9D1 1F3FC 200D 1F384 ; fully-qualified # 🧑🏼‍🎄 E13.0 mx claus: medium-light skin tone -1F9D1 1F3FD 200D 1F384 ; fully-qualified # 🧑🏽‍🎄 E13.0 mx claus: medium skin tone -1F9D1 1F3FE 200D 1F384 ; fully-qualified # 🧑🏾‍🎄 E13.0 mx claus: medium-dark skin tone -1F9D1 1F3FF 200D 1F384 ; fully-qualified # 🧑🏿‍🎄 E13.0 mx claus: dark skin tone +1F9D1 200D 1F384 ; fully-qualified # 🧑‍🎄 E13.0 Mx claus +1F9D1 1F3FB 200D 1F384 ; fully-qualified # 🧑🏻‍🎄 E13.0 Mx claus: light skin tone +1F9D1 1F3FC 200D 1F384 ; fully-qualified # 🧑🏼‍🎄 E13.0 Mx claus: medium-light skin tone +1F9D1 1F3FD 200D 1F384 ; fully-qualified # 🧑🏽‍🎄 E13.0 Mx claus: medium skin tone +1F9D1 1F3FE 200D 1F384 ; fully-qualified # 🧑🏾‍🎄 E13.0 Mx claus: medium-dark skin tone +1F9D1 1F3FF 200D 1F384 ; fully-qualified # 🧑🏿‍🎄 E13.0 Mx claus: dark skin tone 1F9B8 ; fully-qualified # 🦸 E11.0 superhero 1F9B8 1F3FB ; fully-qualified # 🦸🏻 E11.0 superhero: light skin tone 1F9B8 1F3FC ; fully-qualified # 🦸🏼 E11.0 superhero: medium-light skin tone @@ -3721,6 +3721,11 @@ 1F41A ; fully-qualified # 🐚 E0.6 spiral shell 1FAB8 ; fully-qualified # 🪸 E14.0 coral 1FABC ; fully-qualified # 🪼 E15.0 jellyfish +1F980 ; fully-qualified # 🦀 E1.0 crab +1F99E ; fully-qualified # 🦞 E11.0 lobster +1F990 ; fully-qualified # 🦐 E3.0 shrimp +1F991 ; fully-qualified # 🦑 E3.0 squid +1F9AA ; fully-qualified # 🦪 E12.0 oyster # subgroup: animal-bug 1F40C ; fully-qualified # 🐌 E0.6 snail @@ -3777,8 +3782,8 @@ 1F344 ; fully-qualified # 🍄 E0.6 mushroom 1FABE ; fully-qualified # 🪾 E16.0 leafless tree -# Animals & Nature subtotal: 161 -# Animals & Nature subtotal: 161 w/o modifiers +# Animals & Nature subtotal: 166 +# Animals & Nature subtotal: 166 w/o modifiers # group: Food & Drink @@ -3881,13 +3886,6 @@ 1F960 ; fully-qualified # 🥠 E5.0 fortune cookie 1F961 ; fully-qualified # 🥡 E5.0 takeout box -# 
subgroup: food-marine -1F980 ; fully-qualified # 🦀 E1.0 crab -1F99E ; fully-qualified # 🦞 E11.0 lobster -1F990 ; fully-qualified # 🦐 E3.0 shrimp -1F991 ; fully-qualified # 🦑 E3.0 squid -1F9AA ; fully-qualified # 🦪 E12.0 oyster - # subgroup: food-sweet 1F366 ; fully-qualified # 🍦 E0.6 soft ice cream 1F367 ; fully-qualified # 🍧 E0.6 shaved ice @@ -3936,8 +3934,8 @@ 1FAD9 ; fully-qualified # 🫙 E14.0 jar 1F3FA ; fully-qualified # 🏺 E1.0 amphora -# Food & Drink subtotal: 138 -# Food & Drink subtotal: 138 w/o modifiers +# Food & Drink subtotal: 133 +# Food & Drink subtotal: 133 w/o modifiers # group: Travel & Places diff --git a/unicodetools/data/emoji/dev/emoji-zwj-sequences.txt b/unicodetools/data/emoji/dev/emoji-zwj-sequences.txt index e471645bf..ce31f22ce 100644 --- a/unicodetools/data/emoji/dev/emoji-zwj-sequences.txt +++ b/unicodetools/data/emoji/dev/emoji-zwj-sequences.txt @@ -1,5 +1,5 @@ # emoji-zwj-sequences.txt -# Date: 2024-05-01, 21:25:24 GMT +# Date: 2024-06-04, 16:46:01 GMT # © 2024 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -665,7 +665,7 @@ 1F9D1 200D 1F33E ; RGI_Emoji_ZWJ_Sequence ; farmer # E12.1 [1] (🧑‍🌾) 1F9D1 200D 1F373 ; RGI_Emoji_ZWJ_Sequence ; cook # E12.1 [1] (🧑‍🍳) 1F9D1 200D 1F37C ; RGI_Emoji_ZWJ_Sequence ; person feeding baby # E13.0 [1] (🧑‍🍼) -1F9D1 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; mx claus # E13.0 [1] (🧑‍🎄) +1F9D1 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; Mx claus # E13.0 [1] (🧑‍🎄) 1F9D1 200D 1F393 ; RGI_Emoji_ZWJ_Sequence ; student # E12.1 [1] (🧑‍🎓) 1F9D1 200D 1F3A4 ; RGI_Emoji_ZWJ_Sequence ; singer # E12.1 [1] (🧑‍🎤) 1F9D1 200D 1F3A8 ; RGI_Emoji_ZWJ_Sequence ; artist # E12.1 [1] (🧑‍🎨) @@ -689,7 +689,7 @@ 1F9D1 1F3FB 200D 1F33E ; RGI_Emoji_ZWJ_Sequence ; farmer: light skin tone # E12.1 [1] (🧑🏻‍🌾) 1F9D1 1F3FB 200D 1F373 ; RGI_Emoji_ZWJ_Sequence ; cook: light skin tone # E12.1 [1] (🧑🏻‍🍳) 1F9D1 1F3FB 200D 1F37C ; RGI_Emoji_ZWJ_Sequence ; person feeding baby: light skin tone # E13.0 [1] (🧑🏻‍🍼) -1F9D1 1F3FB 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; mx claus: light skin tone # E13.0 [1] (🧑🏻‍🎄) +1F9D1 1F3FB 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; Mx claus: light skin tone # E13.0 [1] (🧑🏻‍🎄) 1F9D1 1F3FB 200D 1F393 ; RGI_Emoji_ZWJ_Sequence ; student: light skin tone # E12.1 [1] (🧑🏻‍🎓) 1F9D1 1F3FB 200D 1F3A4 ; RGI_Emoji_ZWJ_Sequence ; singer: light skin tone # E12.1 [1] (🧑🏻‍🎤) 1F9D1 1F3FB 200D 1F3A8 ; RGI_Emoji_ZWJ_Sequence ; artist: light skin tone # E12.1 [1] (🧑🏻‍🎨) @@ -713,7 +713,7 @@ 1F9D1 1F3FC 200D 1F33E ; RGI_Emoji_ZWJ_Sequence ; farmer: medium-light skin tone # E12.1 [1] (🧑🏼‍🌾) 1F9D1 1F3FC 200D 1F373 ; RGI_Emoji_ZWJ_Sequence ; cook: medium-light skin tone # E12.1 [1] (🧑🏼‍🍳) 1F9D1 1F3FC 200D 1F37C ; RGI_Emoji_ZWJ_Sequence ; person feeding baby: medium-light skin tone # E13.0 [1] (🧑🏼‍🍼) -1F9D1 1F3FC 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; mx claus: medium-light skin tone # E13.0 [1] (🧑🏼‍🎄) +1F9D1 1F3FC 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; Mx claus: medium-light skin tone # E13.0 [1] (🧑🏼‍🎄) 1F9D1 1F3FC 
200D 1F393 ; RGI_Emoji_ZWJ_Sequence ; student: medium-light skin tone # E12.1 [1] (🧑🏼‍🎓) 1F9D1 1F3FC 200D 1F3A4 ; RGI_Emoji_ZWJ_Sequence ; singer: medium-light skin tone # E12.1 [1] (🧑🏼‍🎤) 1F9D1 1F3FC 200D 1F3A8 ; RGI_Emoji_ZWJ_Sequence ; artist: medium-light skin tone # E12.1 [1] (🧑🏼‍🎨) @@ -737,7 +737,7 @@ 1F9D1 1F3FD 200D 1F33E ; RGI_Emoji_ZWJ_Sequence ; farmer: medium skin tone # E12.1 [1] (🧑🏽‍🌾) 1F9D1 1F3FD 200D 1F373 ; RGI_Emoji_ZWJ_Sequence ; cook: medium skin tone # E12.1 [1] (🧑🏽‍🍳) 1F9D1 1F3FD 200D 1F37C ; RGI_Emoji_ZWJ_Sequence ; person feeding baby: medium skin tone # E13.0 [1] (🧑🏽‍🍼) -1F9D1 1F3FD 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; mx claus: medium skin tone # E13.0 [1] (🧑🏽‍🎄) +1F9D1 1F3FD 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; Mx claus: medium skin tone # E13.0 [1] (🧑🏽‍🎄) 1F9D1 1F3FD 200D 1F393 ; RGI_Emoji_ZWJ_Sequence ; student: medium skin tone # E12.1 [1] (🧑🏽‍🎓) 1F9D1 1F3FD 200D 1F3A4 ; RGI_Emoji_ZWJ_Sequence ; singer: medium skin tone # E12.1 [1] (🧑🏽‍🎤) 1F9D1 1F3FD 200D 1F3A8 ; RGI_Emoji_ZWJ_Sequence ; artist: medium skin tone # E12.1 [1] (🧑🏽‍🎨) @@ -761,7 +761,7 @@ 1F9D1 1F3FE 200D 1F33E ; RGI_Emoji_ZWJ_Sequence ; farmer: medium-dark skin tone # E12.1 [1] (🧑🏾‍🌾) 1F9D1 1F3FE 200D 1F373 ; RGI_Emoji_ZWJ_Sequence ; cook: medium-dark skin tone # E12.1 [1] (🧑🏾‍🍳) 1F9D1 1F3FE 200D 1F37C ; RGI_Emoji_ZWJ_Sequence ; person feeding baby: medium-dark skin tone # E13.0 [1] (🧑🏾‍🍼) -1F9D1 1F3FE 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; mx claus: medium-dark skin tone # E13.0 [1] (🧑🏾‍🎄) +1F9D1 1F3FE 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; Mx claus: medium-dark skin tone # E13.0 [1] (🧑🏾‍🎄) 1F9D1 1F3FE 200D 1F393 ; RGI_Emoji_ZWJ_Sequence ; student: medium-dark skin tone # E12.1 [1] (🧑🏾‍🎓) 1F9D1 1F3FE 200D 1F3A4 ; RGI_Emoji_ZWJ_Sequence ; singer: medium-dark skin tone # E12.1 [1] (🧑🏾‍🎤) 1F9D1 1F3FE 200D 1F3A8 ; RGI_Emoji_ZWJ_Sequence ; artist: medium-dark skin tone # E12.1 [1] (🧑🏾‍🎨) @@ -785,7 +785,7 @@ 1F9D1 1F3FF 200D 1F33E ; RGI_Emoji_ZWJ_Sequence ; farmer: dark skin 
tone # E12.1 [1] (🧑🏿‍🌾) 1F9D1 1F3FF 200D 1F373 ; RGI_Emoji_ZWJ_Sequence ; cook: dark skin tone # E12.1 [1] (🧑🏿‍🍳) 1F9D1 1F3FF 200D 1F37C ; RGI_Emoji_ZWJ_Sequence ; person feeding baby: dark skin tone # E13.0 [1] (🧑🏿‍🍼) -1F9D1 1F3FF 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; mx claus: dark skin tone # E13.0 [1] (🧑🏿‍🎄) +1F9D1 1F3FF 200D 1F384 ; RGI_Emoji_ZWJ_Sequence ; Mx claus: dark skin tone # E13.0 [1] (🧑🏿‍🎄) 1F9D1 1F3FF 200D 1F393 ; RGI_Emoji_ZWJ_Sequence ; student: dark skin tone # E12.1 [1] (🧑🏿‍🎓) 1F9D1 1F3FF 200D 1F3A4 ; RGI_Emoji_ZWJ_Sequence ; singer: dark skin tone # E12.1 [1] (🧑🏿‍🎤) 1F9D1 1F3FF 200D 1F3A8 ; RGI_Emoji_ZWJ_Sequence ; artist: dark skin tone # E12.1 [1] (🧑🏿‍🎨) diff --git a/unicodetools/data/emoji/dev/internal/emoji-proposals.txt b/unicodetools/data/emoji/dev/internal/emoji-proposals.txt index edc931dbb..8769e7477 100644 --- a/unicodetools/data/emoji/dev/internal/emoji-proposals.txt +++ b/unicodetools/data/emoji/dev/internal/emoji-proposals.txt @@ -1925,7 +1925,7 @@ 1F9CF 1F3FF; L2/18-229, L2/14-173 # 2019 (🧏🏿) deaf person: dark skin tone # L2/19-231 -1F9D1 200D 1F384; L2/19-231 # 2020 (🧑‍🎄) mx claus +1F9D1 200D 1F384; L2/19-231 # 2020 (🧑‍🎄) Mx claus # L2/19-275, L2/18-223, L2/18-228, L2/19-021, L2/18-340 1F9D1 200D 1F91D 200D 1F9D1; L2/19-275, L2/18-223, L2/18-228, L2/19-021, L2/18-340 # 2019 (🧑‍🤝‍🧑) people holding hands diff --git a/unicodetools/src/main/resources/org/unicode/tools/emoji/emojiOrdering.txt b/unicodetools/src/main/resources/org/unicode/tools/emoji/emojiOrdering.txt index b8a0b631b..f3bfaccdd 100644 --- a/unicodetools/src/main/resources/org/unicode/tools/emoji/emojiOrdering.txt +++ b/unicodetools/src/main/resources/org/unicode/tools/emoji/emojiOrdering.txt @@ -408,7 +408,7 @@ 🐊 🐢 🦎 🐍 🐲 🐉 🦕 🦖 @ animal-marine -🐳 🐋 🐬 🦭 🐟 🐠 🐡 🦈 🐙 🐚 🪸 🪼 +🐳 🐋 🐬 🦭 🐟 🐠 🐡 🦈 🐙 🐚 🪸 🪼 🦀 🦞 🦐 🦑 🦪 @ animal-bug 🐌 🦋 🐛 🐜 🐝 🪲 🐞 🦗 🪳 🕷️ 🕸️ 🦂 🦟 🪰 🪱 🦠 @ plant-flower @@ -428,8 +428,6 @@ 🥫 @ food-asian 🍱 🍘 🍙 🍚 🍛 🍜 🍝 🍠 🍢 🍣 🍤 🍥 🥮 🍡 🥟 🥠 🥡 -@ food-marine -🦀 🦞 🦐 🦑 🦪 @ food-sweet 
🍦 🍧 🍨 🍩 🍪 🎂 🍰 🧁 🥧 🍫 🍬 🍭 🍮 🍯 @ drink From 2f2b5b940c41be26e04d0878a97c4f067755bb08 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Wed, 5 Jun 2024 12:59:23 -0700 Subject: [PATCH 08/10] UCA 16 beta jun05 test files --- unicodetools/data/uca/dev/CollationTest.html | 2 +- .../CollationTest_NON_IGNORABLE.txt | 110 +++++++++-------- .../CollationTest_NON_IGNORABLE_SHORT.txt | 18 ++- .../CollationTest/CollationTest_SHIFTED.txt | 112 ++++++++++-------- .../CollationTest_SHIFTED_SHORT.txt | 16 ++- 5 files changed, 159 insertions(+), 99 deletions(-) diff --git a/unicodetools/data/uca/dev/CollationTest.html b/unicodetools/data/uca/dev/CollationTest.html index f204d01c4..82e72214a 100644 --- a/unicodetools/data/uca/dev/CollationTest.html +++ b/unicodetools/data/uca/dev/CollationTest.html @@ -91,7 +91,7 @@

Testing

If there are any errors, then the UCA implementation is not compliant.

These files contain test cases that include ill-formed strings, with surrogate code points. Implementations that do not weight surrogate code points the same way as reserved code points - may filter out such lines lines in the test cases, before testing for conformance.

+ may filter out such lines in the test cases, before testing for conformance.