diff --git a/.github/workflows/cli-build-instructions.yml b/.github/workflows/cli-build-instructions.yml index 3fd9e8e56..24d4dce1c 100644 --- a/.github/workflows/cli-build-instructions.yml +++ b/.github/workflows/cli-build-instructions.yml @@ -81,8 +81,14 @@ jobs: run: | mkdir -p Generated/BIN - - name: Run command - Build and Test - run: MAVEN_OPTS="-ea" mvn -s .github/workflows/mvn-settings.xml package -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION + # Since these are just examples to smoke-test the in-source build process, + # let’s not run the whole build and test suite, which is quite slow (6 min + # 26 s as of this writing). Just run the invariant tests and smoke-test + # MakeUnicodeFiles. We don’t even check that MakeUnicodeFiles doesn’t + # change anything, which makes little sense; but that is the job of the + # other job. + - name: Run invariant tests + run: MAVEN_OPTS="-ea" mvn -s .github/workflows/mvn-settings.xml test -am -pl unicodetools -Dtest=TestTestUnicodeInvariants -DfailIfNoTests=false -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -91,14 +97,15 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - out-of-source-build: - name: Out-of-source Instructions + + # Out-of-source build. + ucd-and-smoke-tests: + name: Check UCD consistency, invariants, smoke-test generators runs-on: ubuntu-latest steps: - name: Checkout Unicode Tools uses: actions/checkout@v3 with: - repository: unicode-org/unicodetools path: unicodetools/mine/src - name: Get the CLDR_REF from pom.xml id: cldr_ref @@ -136,6 +143,30 @@ jobs: run: | mkdir -p unicodetools/mine/Generated/BIN + - name: Run command - Make Unicode Files + run: | + cd unicodetools/mine/src + mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCD.Main" -Dexec.args="version $CURRENT_UVERSION build MakeUnicodeFiles" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Check that UCD files are consistent + run: | + cd unicodetools/mine/src + ./py/copygenerateducd.py --out-of-source -y + git diff --compact-summary --exit-code || { + git diff --compact-summary | + awk '{ + if (previous) { + print "::error file="previous",title=File must be regenerated::Run org.unicode.text.UCD.Main build MakeUnicodeFiles and copy any changed files to unicodetools/data/ucd/dev." + } + previous=$1 + }' + exit 1 + } + + # Only test once we know the UCD is internally consistent. + # MakeUnicodeFiles is much faster than this anyway. - name: Run command - Build and Test run: | cd unicodetools/mine/src @@ -151,13 +182,6 @@ jobs: path: | unicodetools/mine/Generated/UnicodeTestResults.* - - name: Run command - Make Unicode Files - run: | - cd unicodetools/mine/src - mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCD.Main" -Dexec.args="version $CURRENT_UVERSION build MakeUnicodeFiles" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # https://github.com/unicode-org/unicodetools/blob/main/docs/emoji/aac.md#aacorderjava - name: Run command - AAC Order run: | @@ -166,18 +190,6 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # https://github.com/unicode-org/unicodetools/blob/main/docs/uca/index.md#tools--tests - # Note: Not running desuffixucd.py in UCA jobs because no version numbers detected in data file names - - name: Run command - UCA - collation validity log - run: | - cd unicodetools/mine/src - # invoke main() in class ...UCA.Main - mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCA.Main" -Dexec.args="writeCollationValidityLog ICU" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION - # check for output file - compgen -G "../Generated/UCA/*/CheckCollationValidity.html" - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # https://github.com/unicode-org/unicodetools/blob/main/docs/idna.md - name: Run command - IDNA run: | @@ -252,3 +264,61 @@ jobs: mvn -s .github/workflows/mvn-settings.xml -Dexec.mainClass="org.unicode.propstest.CheckProperties" -Dexec.classpathScope=test test-compile -Dexec.args="COMPARE ALL $PREVIOUS_UVERSION" compile exec:java -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + # Out-of-source build. + uca: + name: Check UCA data + runs-on: ubuntu-latest + steps: + - name: Checkout Unicode Tools + uses: actions/checkout@v3 + with: + repository: unicode-org/unicodetools + path: unicodetools/mine/src + - name: Get the CLDR_REF from pom.xml + id: cldr_ref + run: echo "CLDR_REF="$(mvn --file unicodetools/mine/src/pom.xml help:evaluate -Dexpression=cldr.version -q -DforceStdout | cut -d- -f3) >> $GITHUB_OUTPUT && cat ${GITHUB_OUTPUT} + - name: Verify CLDR checkout ref + run: echo CLDR_REF="${{ steps.cldr_ref.outputs.CLDR_REF }}" && [ "${{ steps.cldr_ref.outputs.CLDR_REF }}x" != "x" ] # fail if empty + - name: Cache CLDR repository + uses: actions/cache@v3 + with: + path: cldr/mine/src + key: cldr-${{ steps.cldr_ref.outputs.CLDR_REF }} + restore-keys: | + cldr + - name: Check out CLDR + uses: actions/checkout@v3 + with: + repository: unicode-org/cldr + path: cldr/mine/src + ref: main + fetch-depth: 0 + - name: Switch CLDR to CLDR_REF + run: cd cldr/mine/src && git fetch && git checkout ${{ steps.cldr_ref.outputs.CLDR_REF }} + - name: Set up JDK 11 + uses: actions/setup-java@v1 + with: + java-version: 11 + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + - name: Set up out-of-source output dir + run: | + mkdir -p unicodetools/mine/Generated/BIN + + # https://github.com/unicode-org/unicodetools/blob/main/docs/uca/index.md#tools--tests + # Note: Not running desuffixucd.py in UCA jobs because no version numbers detected in data file names + - name: Run command - UCA - collation validity log + run: | + cd unicodetools/mine/src + # invoke main() in class ...UCA.Main + mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCA.Main" -Dexec.args="writeCollationValidityLog ICU" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION + # check for output file + compgen -G "../Generated/UCA/*/CheckCollationValidity.html" + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/UnicodeJsps/jetty.d/ROOT/robots.txt b/UnicodeJsps/jetty.d/ROOT/robots.txt new file mode 100644 index 000000000..a40ff93be --- /dev/null +++ b/UnicodeJsps/jetty.d/ROOT/robots.txt @@ -0,0 +1,2 @@ +User-agent: * +Disallow: /UnicodeJsps diff --git a/UnicodeJsps/pom.xml b/UnicodeJsps/pom.xml index 98f0e75b2..83d01106f 100644 --- a/UnicodeJsps/pom.xml +++ b/UnicodeJsps/pom.xml @@ -65,7 +65,7 @@ com.google.guava guava - 29.0-jre + 32.0.0-jre diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java b/UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java index 35e845d58..3f78c440b 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java @@ -29,7 +29,7 @@ import org.unicode.props.UnicodeProperty; public class CachedProps { - public static final boolean IS_BETA = true; + public static final boolean IS_BETA = false; public static final Splitter HASH_SPLITTER = Splitter.on('#').trimResults(); public static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults(); diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java index aebbfd12e..f4050d8dd 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java @@ -637,16 +637,7 @@ private void showString(final String string, String separator, Appendable out) if (UnicodeUtilities.RTL.containsSome(literal)) { literal = '\u200E' + literal + '\u200E'; } - String name = UnicodeUtilities.getName(string, separator, false); - if (name == null || name.length() == 0) { - name = "no name"; - } else { - boolean special = name.indexOf('<') >= 0; - name = UnicodeUtilities.toHTML.transliterate(name); - if (special) { - name = "" + name + ""; - } - } + String name = UnicodeUtilities.getName(string, separator, false, false); literal = UnicodeSetUtilities.addEmojiVariation(literal); if (doTable) { out.append( @@ -801,7 +792,8 @@ String getPropString(List props, String codePoints, boolean sho // } } - private static String getName(String string, String separator, boolean andCode) { + private static String getName( + String string, String separator, boolean andCode, boolean plainText) { StringBuilder result = new StringBuilder(); int cp; for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) { @@ -812,7 +804,25 @@ private static String getName(String string, String separator, boolean andCode) if (andCode) { result.append("U+").append(com.ibm.icu.impl.Utility.hex(cp, 4)).append(' '); } - result.append(CachedProps.NAMES.getValue(cp)); + final String name = CachedProps.NAMES.getValue(cp); + if (name != null) { + result.append(name); + } else { + // TODO(egg): We only have Name_Aliasβ during β, which is silly. This will probably + // solve itself as part of https://github.com/unicode-org/unicodetools/issues/432. + String alias = + getFactory() + .getProperty(CachedProps.IS_BETA ? "Name_Aliasβ" : "Name_Alias") + .getValue(cp); + if (alias == null) { + alias = "no name"; + } + if (plainText) { + result.append("(" + alias + ")"); + } else { + result.append("" + alias + ""); + } + } } return result.toString(); } @@ -1931,7 +1941,7 @@ private static void showBidiLine( writer.println("Character"); for (int i = 0; i < str.length(); ++i) { final String s = str.substring(i, i + 1); - String title = toHTML.transform(getName(s, "", true)); + String title = toHTML.transform(getName(s, "", true, true)); writer.println( ", and make sure that there aren't any Z-Other props at the bottom (you'll need to update via Adding New Properties if there are). -(:construction: **TODO**: explain how to do a Docker-based build here.) +### Running a Docker-based build + +compile java stuff + +- `mvn -B package -am -pl UnicodeJsps -DskipTests=true` + +”backup” copy of CLDR and UnicodeTools. (`~/src/cldr` is an optional existing CLDR dir to save a few packets) + +- `git clone --reference-if-able ~/src/cldr https://github.com/unicode-org/cldr.git || (cd cldr && git pull)` +- `mkdir -p UnicodeJsps/target && tar -cpz --exclude=.git --exclude=unicodetools/target/ -f UnicodeJsps/target/cldr-unicodetools.tgz ./cldr/ ./unicodetools/` + +Now, finally build. + +- `docker build -t unicode/unicode-jsp:latest UnicodeJsps/` + +… And run. Control-C to cancel it, otherwise visit + +``` +docker run --rm -p 8080:8080 unicode/unicode-jsp:latest +``` ## Commit/PR diff --git a/py/copygenerateducd.py b/py/copygenerateducd.py old mode 100644 new mode 100755 index a1a8f2f73..1b64f116b --- a/py/copygenerateducd.py +++ b/py/copygenerateducd.py @@ -17,9 +17,10 @@ def main(): + out_of_source = '--out-of-source' in sys.argv[1:] cwd = Path().cwd() uversion = os.getenv("CURRENT_UVERSION") - genucddir = cwd / "Generated" / "UCD" / uversion + genucddir = (cwd / ".." if out_of_source else cwd) / "Generated" / "UCD" / uversion if not genucddir.exists(): raise Exception(f"Generated directory not found at {genucddir.absolute()}") @@ -34,7 +35,7 @@ def main(): print("THE FOLLOWING FILES WILL BE MOVED:\n") print("\n".join([f"{str(p.name)} --> {devucddir / p.relative_to(genucddir)}" for p in to_move])) # noqa: E501 - confirm = bool(sys.argv[-1] == "-y") # enable running this in automation + confirm = bool("-y" in sys.argv[1:]) # enable running this in automation if not confirm: confirm = input("\nProceed [y/N]?").lower() == "y" diff --git a/unicodetools/data/ucd/dev/ArabicShaping.txt b/unicodetools/data/ucd/dev/ArabicShaping.txt index dd8cb333e..0def17a03 100644 --- a/unicodetools/data/ucd/dev/ArabicShaping.txt +++ b/unicodetools/data/ucd/dev/ArabicShaping.txt @@ -828,6 +828,11 @@ A873; PHAGS-PA CANDRABINDU; U; No_Joining_Group 10D22; HANIFI ROHINGYA SAKIN; R; No_Joining_Group 10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA +# Arabic Extended-D Characters +10EC2; DAL WITH VERTICAL 2 DOTS BELOW; R; DAL +10EC3; TAH WITH VERTICAL 2 DOTS BELOW; D; TAH +10EC4; KAF WITH VERTICAL 2 DOTS BELOW; D; KAF + # Sogdian Characters 10F30; SOGDIAN ALEPH; D; No_Joining_Group diff --git a/unicodetools/data/ucd/dev/Blocks.txt b/unicodetools/data/ucd/dev/Blocks.txt index 8fa3eaad0..15fbbd0a3 100644 --- a/unicodetools/data/ucd/dev/Blocks.txt +++ b/unicodetools/data/ucd/dev/Blocks.txt @@ -274,6 +274,7 @@ FFF0..FFFF; Specials 11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A 11AC0..11AFF; Pau Cin Hau 11B00..11B5F; Devanagari Extended-A +11BC0..11BFF; Sunuwar 11C00..11C6F; Bhaiksuki 11C70..11CBF; Marchen 11D00..11D5F; Masaram Gondi diff --git a/unicodetools/data/ucd/dev/CaseFolding.txt b/unicodetools/data/ucd/dev/CaseFolding.txt index 69c5c64b4..ba43df3ec 100644 --- a/unicodetools/data/ucd/dev/CaseFolding.txt +++ b/unicodetools/data/ucd/dev/CaseFolding.txt @@ -1,5 +1,5 @@ -# CaseFolding-15.1.0.txt -# Date: 2023-05-12, 21:53:10 GMT +# CaseFolding-16.0.0.txt +# Date: 2023-10-03, 19:01:21 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -603,6 +603,7 @@ 1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN 1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT 1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK +1C89; C; 1C8A; # CYRILLIC CAPITAL LETTER TJE 1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN 1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN 1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN diff --git a/unicodetools/data/ucd/dev/DerivedAge.txt b/unicodetools/data/ucd/dev/DerivedAge.txt index ae3327f3c..eedf1bc69 100644 --- a/unicodetools/data/ucd/dev/DerivedAge.txt +++ b/unicodetools/data/ucd/dev/DerivedAge.txt @@ -1,5 +1,5 @@ # DerivedAge-16.0.0.txt -# Date: 2023-10-02, 12:51:03 GMT +# Date: 2023-10-13, 15:52:11 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -2009,9 +2009,15 @@ FDFE..FDFF ; 14.0 # [2] ARABIC LIGATURE SUBHAANAHU WA TAAALAA..ARABIC LIGAT # Newly assigned in Unicode 16.0.0 (September, 2024) +0897 ; 16.0 # ARABIC PEPET 0C5C ; 16.0 # TELUGU ARCHAIC SHRII 0CDC ; 16.0 # KANNADA ARCHAIC SHRII +1C89..1C8A ; 16.0 # [2] CYRILLIC CAPITAL LETTER TJE..CYRILLIC SMALL LETTER TJE +10EC2..10EC4 ; 16.0 # [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC ; 16.0 # ARABIC COMBINING ALEF OVERLAY +11BC0..11BE1 ; 16.0 # [34] SUNUWAR LETTER DEVI..SUNUWAR SIGN PVO +11BF0..11BF9 ; 16.0 # [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE -# Total code points: 2 +# Total code points: 53 # EOF diff --git a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt index 00ee93c56..132266cb1 100644 --- a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt +++ b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt @@ -1,5 +1,5 @@ # DerivedCoreProperties-16.0.0.txt -# Date: 2023-10-02, 12:51:30 GMT +# Date: 2023-10-13, 15:52:30 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -343,6 +343,7 @@ FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A 0860..086A ; Alphabetic # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; Alphabetic # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088E ; Alphabetic # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL +0897 ; Alphabetic # Mn ARABIC PEPET 08A0..08C8 ; Alphabetic # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; Alphabetic # Lm ARABIC SMALL FARSI YEH 08D4..08DF ; Alphabetic # Mn [12] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH WORD WAQFA @@ -710,7 +711,7 @@ FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A 1C4D..1C4F ; Alphabetic # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; Alphabetic # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; Alphabetic # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD -1C80..1C88 ; Alphabetic # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; Alphabetic # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; Alphabetic # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Alphabetic # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CE9..1CEC ; Alphabetic # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL @@ -1041,6 +1042,8 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 10E80..10EA9 ; Alphabetic # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAB..10EAC ; Alphabetic # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EB0..10EB1 ; Alphabetic # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2..10EC4 ; Alphabetic # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC ; Alphabetic # Mn ARABIC COMBINING ALEF OVERLAY 10F00..10F1C ; Alphabetic # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; Alphabetic # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; Alphabetic # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -1211,6 +1214,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 11A97 ; Alphabetic # Mc SOYOMBO SIGN VISARGA 11A9D ; Alphabetic # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; Alphabetic # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; Alphabetic # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; Alphabetic # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; Alphabetic # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; Alphabetic # Mc BHAIKSUKI VOWEL SIGN AA @@ -1402,7 +1406,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 30000..3134A ; Alphabetic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Alphabetic # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 138389 +# Total code points: 138429 # ================================================ @@ -1691,6 +1695,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG 10FD..10FF ; Lowercase # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13F8..13FD ; Lowercase # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C88 ; Lowercase # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C8A ; Lowercase # L& CYRILLIC SMALL LETTER TJE 1D00..1D2B ; Lowercase # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; Lowercase # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; Lowercase # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G @@ -2096,7 +2101,7 @@ FF41..FF5A ; Lowercase # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L 1E030..1E06D ; Lowercase # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E922..1E943 ; Lowercase # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA -# Total code points: 2544 +# Total code points: 2545 # ================================================ @@ -2379,6 +2384,7 @@ FF41..FF5A ; Lowercase # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L 10C7 ; Uppercase # L& GEORGIAN CAPITAL LETTER YN 10CD ; Uppercase # L& GEORGIAN CAPITAL LETTER AEN 13A0..13F5 ; Uppercase # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV +1C89 ; Uppercase # L& CYRILLIC CAPITAL LETTER TJE 1C90..1CBA ; Uppercase # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Uppercase # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00 ; Uppercase # L& LATIN CAPITAL LETTER A WITH RING BELOW @@ -2755,7 +2761,7 @@ FF21..FF3A ; Uppercase # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH 1F150..1F169 ; Uppercase # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Uppercase # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 1951 +# Total code points: 1952 # ================================================ @@ -2800,7 +2806,7 @@ FF21..FF3A ; Uppercase # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH 10FD..10FF ; Cased # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13A0..13F5 ; Cased # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; Cased # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV -1C80..1C88 ; Cased # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; Cased # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; Cased # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Cased # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1D00..1D2B ; Cased # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL @@ -2938,7 +2944,7 @@ FF41..FF5A ; Cased # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN 1F150..1F169 ; Cased # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Cased # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 4526 +# Total code points: 4528 # ================================================ @@ -3015,7 +3021,7 @@ FF41..FF5A ; Cased # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN 0859..085B ; Case_Ignorable # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 0888 ; Case_Ignorable # Sk ARABIC RAISED ROUND DOT 0890..0891 ; Case_Ignorable # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE -0898..089F ; Case_Ignorable # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; Case_Ignorable # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08C9 ; Case_Ignorable # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; Case_Ignorable # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E2 ; Case_Ignorable # Cf ARABIC DISPUTED END OF AYAH @@ -3297,7 +3303,7 @@ FFF9..FFFB ; Case_Ignorable # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI 10AE5..10AE6 ; Case_Ignorable # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Case_Ignorable # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; Case_Ignorable # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK -10EFD..10EFF ; Case_Ignorable # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EFC..10EFF ; Case_Ignorable # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Case_Ignorable # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Case_Ignorable # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; Case_Ignorable # Mn BRAHMI SIGN ANUSVARA @@ -3440,7 +3446,7 @@ E0001 ; Case_Ignorable # Cf LANGUAGE TAG E0020..E007F ; Case_Ignorable # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Case_Ignorable # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 2707 +# Total code points: 2709 # ================================================ @@ -3724,6 +3730,7 @@ E0100..E01EF ; Case_Ignorable # Mn [240] VARIATION SELECTOR-17..VARIATION SELEC 10C7 ; Changes_When_Lowercased # L& GEORGIAN CAPITAL LETTER YN 10CD ; Changes_When_Lowercased # L& GEORGIAN CAPITAL LETTER AEN 13A0..13F5 ; Changes_When_Lowercased # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV +1C89 ; Changes_When_Lowercased # L& CYRILLIC CAPITAL LETTER TJE 1C90..1CBA ; Changes_When_Lowercased # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Changes_When_Lowercased # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00 ; Changes_When_Lowercased # L& LATIN CAPITAL LETTER A WITH RING BELOW @@ -4059,7 +4066,7 @@ FF21..FF3A ; Changes_When_Lowercased # L& [26] FULLWIDTH LATIN CAPITAL LETTE 16E40..16E5F ; Changes_When_Lowercased # L& [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y 1E900..1E921 ; Changes_When_Lowercased # L& [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA -# Total code points: 1433 +# Total code points: 1434 # ================================================ @@ -4357,6 +4364,7 @@ FF21..FF3A ; Changes_When_Lowercased # L& [26] FULLWIDTH LATIN CAPITAL LETTE 10FD..10FF ; Changes_When_Uppercased # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13F8..13FD ; Changes_When_Uppercased # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C88 ; Changes_When_Uppercased # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C8A ; Changes_When_Uppercased # L& CYRILLIC SMALL LETTER TJE 1D79 ; Changes_When_Uppercased # L& LATIN SMALL LETTER INSULAR G 1D7D ; Changes_When_Uppercased # L& LATIN SMALL LETTER P WITH STROKE 1D8E ; Changes_When_Uppercased # L& LATIN SMALL LETTER Z WITH PALATAL HOOK @@ -4696,7 +4704,7 @@ FF41..FF5A ; Changes_When_Uppercased # L& [26] FULLWIDTH LATIN SMALL LETTER 16E60..16E7F ; Changes_When_Uppercased # L& [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y 1E922..1E943 ; Changes_When_Uppercased # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA -# Total code points: 1525 +# Total code points: 1526 # ================================================ @@ -4993,6 +5001,7 @@ FF41..FF5A ; Changes_When_Uppercased # L& [26] FULLWIDTH LATIN SMALL LETTER 0561..0587 ; Changes_When_Titlecased # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN 13F8..13FD ; Changes_When_Titlecased # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C88 ; Changes_When_Titlecased # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C8A ; Changes_When_Titlecased # L& CYRILLIC SMALL LETTER TJE 1D79 ; Changes_When_Titlecased # L& LATIN SMALL LETTER INSULAR G 1D7D ; Changes_When_Titlecased # L& LATIN SMALL LETTER P WITH STROKE 1D8E ; Changes_When_Titlecased # L& LATIN SMALL LETTER Z WITH PALATAL HOOK @@ -5332,7 +5341,7 @@ FF41..FF5A ; Changes_When_Titlecased # L& [26] FULLWIDTH LATIN SMALL LETTER 16E60..16E7F ; Changes_When_Titlecased # L& [32] MEDEFAIDRIN SMALL LETTER M..MEDEFAIDRIN SMALL LETTER Y 1E922..1E943 ; Changes_When_Titlecased # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA -# Total code points: 1452 +# Total code points: 1453 # ================================================ @@ -5623,7 +5632,7 @@ FF41..FF5A ; Changes_When_Titlecased # L& [26] FULLWIDTH LATIN SMALL LETTER 10C7 ; Changes_When_Casefolded # L& GEORGIAN CAPITAL LETTER YN 10CD ; Changes_When_Casefolded # L& GEORGIAN CAPITAL LETTER AEN 13F8..13FD ; Changes_When_Casefolded # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV -1C80..1C88 ; Changes_When_Casefolded # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C89 ; Changes_When_Casefolded # L& [10] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC CAPITAL LETTER TJE 1C90..1CBA ; Changes_When_Casefolded # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Changes_When_Casefolded # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00 ; Changes_When_Casefolded # L& LATIN CAPITAL LETTER A WITH RING BELOW @@ -5964,7 +5973,7 @@ FF21..FF3A ; Changes_When_Casefolded # L& [26] FULLWIDTH LATIN CAPITAL LETTE 16E40..16E5F ; Changes_When_Casefolded # L& [32] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN CAPITAL LETTER Y 1E900..1E921 ; Changes_When_Casefolded # L& [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA -# Total code points: 1506 +# Total code points: 1507 # ================================================ @@ -6027,7 +6036,7 @@ FF21..FF3A ; Changes_When_Casefolded # L& [26] FULLWIDTH LATIN CAPITAL LETTE 10FD..10FF ; Changes_When_Casemapped # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13A0..13F5 ; Changes_When_Casemapped # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV 13F8..13FD ; Changes_When_Casemapped # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV -1C80..1C88 ; Changes_When_Casemapped # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; Changes_When_Casemapped # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; Changes_When_Casemapped # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Changes_When_Casemapped # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1D79 ; Changes_When_Casemapped # L& LATIN SMALL LETTER INSULAR G @@ -6105,7 +6114,7 @@ FF41..FF5A ; Changes_When_Casemapped # L& [26] FULLWIDTH LATIN SMALL LETTER 16E40..16E7F ; Changes_When_Casemapped # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y 1E900..1E943 ; Changes_When_Casemapped # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA -# Total code points: 2927 +# Total code points: 2929 # ================================================ @@ -6364,7 +6373,7 @@ FF41..FF5A ; Changes_When_Casemapped # L& [26] FULLWIDTH LATIN SMALL LETTER 1C4D..1C4F ; ID_Start # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; ID_Start # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; ID_Start # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD -1C80..1C88 ; ID_Start # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; ID_Start # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; ID_Start # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; ID_Start # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CE9..1CEC ; ID_Start # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL @@ -6641,6 +6650,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 10D00..10D23 ; ID_Start # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10E80..10EA9 ; ID_Start # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; ID_Start # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2..10EC4 ; ID_Start # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10F00..10F1C ; ID_Start # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; ID_Start # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; ID_Start # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -6713,6 +6723,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 11A5C..11A89 ; ID_Start # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; ID_Start # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; ID_Start # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; ID_Start # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; ID_Start # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; ID_Start # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; ID_Start # Lo BHAIKSUKI SIGN AVAGRAHA @@ -6859,7 +6870,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 30000..3134A ; ID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; ID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 136969 +# Total code points: 137007 # ================================================ @@ -6966,7 +6977,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 0860..086A ; ID_Continue # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; ID_Continue # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088E ; ID_Continue # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL -0898..089F ; ID_Continue # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; ID_Continue # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; ID_Continue # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; ID_Continue # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; ID_Continue # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA @@ -7399,7 +7410,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 1C50..1C59 ; ID_Continue # Nd [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE 1C5A..1C77 ; ID_Continue # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; ID_Continue # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD -1C80..1C88 ; ID_Continue # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; ID_Continue # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; ID_Continue # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; ID_Continue # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CD0..1CD2 ; ID_Continue # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA @@ -7782,7 +7793,8 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN 10E80..10EA9 ; ID_Continue # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAB..10EAC ; ID_Continue # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EB0..10EB1 ; ID_Continue # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE -10EFD..10EFF ; ID_Continue # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EC2..10EC4 ; ID_Continue # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC..10EFF ; ID_Continue # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F00..10F1C ; ID_Continue # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; ID_Continue # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; ID_Continue # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -7988,6 +8000,8 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN 11A98..11A99 ; ID_Continue # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11A9D ; ID_Continue # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; ID_Continue # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; ID_Continue # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BF0..11BF9 ; ID_Continue # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; ID_Continue # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; ID_Continue # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; ID_Continue # Mc BHAIKSUKI VOWEL SIGN AA @@ -8218,7 +8232,7 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN 31350..323AF ; ID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF E0100..E01EF ; ID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 140110 +# Total code points: 140160 # ================================================ @@ -8474,7 +8488,7 @@ E0100..E01EF ; ID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR 1C4D..1C4F ; XID_Start # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; XID_Start # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; XID_Start # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD -1C80..1C88 ; XID_Start # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; XID_Start # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; XID_Start # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; XID_Start # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CE9..1CEC ; XID_Start # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL @@ -8755,6 +8769,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU 10D00..10D23 ; XID_Start # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10E80..10EA9 ; XID_Start # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; XID_Start # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2..10EC4 ; XID_Start # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10F00..10F1C ; XID_Start # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; XID_Start # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; XID_Start # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -8827,6 +8842,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU 11A5C..11A89 ; XID_Start # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; XID_Start # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; XID_Start # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; XID_Start # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; XID_Start # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; XID_Start # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; XID_Start # Lo BHAIKSUKI SIGN AVAGRAHA @@ -8973,7 +8989,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU 30000..3134A ; XID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; XID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 136946 +# Total code points: 136984 # ================================================ @@ -9076,7 +9092,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU 0860..086A ; XID_Continue # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA 0870..0887 ; XID_Continue # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0889..088E ; XID_Continue # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL -0898..089F ; XID_Continue # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; XID_Continue # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; XID_Continue # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; XID_Continue # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; XID_Continue # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA @@ -9509,7 +9525,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU 1C50..1C59 ; XID_Continue # Nd [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE 1C5A..1C77 ; XID_Continue # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; XID_Continue # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD -1C80..1C88 ; XID_Continue # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; XID_Continue # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; XID_Continue # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; XID_Continue # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CD0..1CD2 ; XID_Continue # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA @@ -9897,7 +9913,8 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA 10E80..10EA9 ; XID_Continue # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAB..10EAC ; XID_Continue # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EB0..10EB1 ; XID_Continue # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE -10EFD..10EFF ; XID_Continue # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EC2..10EC4 ; XID_Continue # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC..10EFF ; XID_Continue # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F00..10F1C ; XID_Continue # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; XID_Continue # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; XID_Continue # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -10103,6 +10120,8 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA 11A98..11A99 ; XID_Continue # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER 11A9D ; XID_Continue # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; XID_Continue # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; XID_Continue # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BF0..11BF9 ; XID_Continue # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; XID_Continue # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; XID_Continue # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; XID_Continue # Mc BHAIKSUKI VOWEL SIGN AA @@ -10333,7 +10352,7 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA 31350..323AF ; XID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF E0100..E01EF ; XID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 140091 +# Total code points: 140141 # ================================================ @@ -10418,7 +10437,7 @@ E01F0..E0FFF ; Default_Ignorable_Code_Point # Cn [3600] ...... -# Total code points: 10491 +# Total code points: 10492 # ================================================ @@ -9652,6 +9653,7 @@ E01F0..E0FFF ; NFKC_CF; # Cn [3600] ...... -# Total code points: 10453 +# Total code points: 10454 # ================================================ @@ -15411,7 +15413,7 @@ E01F0..E0FFF ; NFKC_SCF; # Cn [3600] ...... -# Total code points: 10491 +# Total code points: 10492 # EOF diff --git a/unicodetools/data/ucd/dev/EastAsianWidth.txt b/unicodetools/data/ucd/dev/EastAsianWidth.txt index 8651fd388..171c4350c 100644 --- a/unicodetools/data/ucd/dev/EastAsianWidth.txt +++ b/unicodetools/data/ucd/dev/EastAsianWidth.txt @@ -1,5 +1,5 @@ # EastAsianWidth-16.0.0.txt -# Date: 2023-10-02, 12:51:38 GMT +# Date: 2023-10-13, 15:52:36 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -334,7 +334,7 @@ 0888 ; N # Sk ARABIC RAISED ROUND DOT 0889..088E ; N # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL 0890..0891 ; N # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE -0898..089F ; N # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; N # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; N # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; N # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; N # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA @@ -859,7 +859,7 @@ 1C5A..1C77 ; N # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; N # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C7E..1C7F ; N # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD -1C80..1C88 ; N # Ll [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; N # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; N # Lu [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; N # Lu [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; N # Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA @@ -1947,7 +1947,8 @@ FFFD ; A # So REPLACEMENT CHARACTER 10EAB..10EAC ; N # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EAD ; N # Pd YEZIDI HYPHENATION MARK 10EB0..10EB1 ; N # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE -10EFD..10EFF ; N # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EC2..10EC4 ; N # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC..10EFF ; N # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F00..10F1C ; N # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F1D..10F26 ; N # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F27 ; N # Lo OLD SOGDIAN LIGATURE AYIN-DALETH @@ -2195,6 +2196,9 @@ FFFD ; A # So REPLACEMENT CHARACTER 11AB0..11ABF ; N # Lo [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA 11AC0..11AF8 ; N # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL 11B00..11B09 ; N # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU +11BC0..11BE0 ; N # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BE1 ; N # Po SUNUWAR SIGN PVO +11BF0..11BF9 ; N # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; N # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; N # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; N # Mc BHAIKSUKI VOWEL SIGN AA diff --git a/unicodetools/data/ucd/dev/IndicPositionalCategory.txt b/unicodetools/data/ucd/dev/IndicPositionalCategory.txt index a7c5aef60..9b5aabfa0 100644 --- a/unicodetools/data/ucd/dev/IndicPositionalCategory.txt +++ b/unicodetools/data/ucd/dev/IndicPositionalCategory.txt @@ -1,11 +1,11 @@ -# IndicPositionalCategory-15.1.0.txt -# Date: 2023-01-05 +# IndicPositionalCategory-16.0.0.txt +# Date: 2023-10-02, 22:58:33 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html # -# For documentation, see UAX #44: Unicode Character Database, -# at https://www.unicode.org/reports/tr44/ +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ # # This file defines the following property: # diff --git a/unicodetools/data/ucd/dev/IndicSyllabicCategory.txt b/unicodetools/data/ucd/dev/IndicSyllabicCategory.txt index f2623b471..5de0d7554 100644 --- a/unicodetools/data/ucd/dev/IndicSyllabicCategory.txt +++ b/unicodetools/data/ucd/dev/IndicSyllabicCategory.txt @@ -1,11 +1,11 @@ -# IndicSyllabicCategory-15.1.0.txt -# Date: 2023-01-05 +# IndicSyllabicCategory-16.0.0.txt +# Date: 2023-10-02, 22:58:33 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html # -# For documentation, see UAX #44: Unicode Character Database, -# at https://www.unicode.org/reports/tr44/ +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ # # This file defines the following property: # @@ -1335,7 +1335,7 @@ ABF0..ABF9 ; Number # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NI # script, e.g. in Brahmi) # # Note: These are different from Numbers, in the way that there is no known -# evidence of Brahmi Joining Numbers taking vowels or subjoined consonants. +# evidence of Brahmi Joining Numbers taking vowels or subjoined consonants. # Until such evidence is found, implementations may assume that Brahmi # Joining Numbers only participate in shaping with other Brahmi Joining # Numbers. diff --git a/unicodetools/data/ucd/dev/LineBreak.txt b/unicodetools/data/ucd/dev/LineBreak.txt index bb8c9a5ae..65cac516f 100644 --- a/unicodetools/data/ucd/dev/LineBreak.txt +++ b/unicodetools/data/ucd/dev/LineBreak.txt @@ -1,5 +1,5 @@ -# LineBreak-15.1.0.txt -# Date: 2023-07-28, 13:19:22 GMT [KW] +# LineBreak-16.0.0.txt +# Date: 2023-10-13, 11:29:24 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -280,7 +280,7 @@ 0888 ; AL # Sk ARABIC RAISED ROUND DOT 0889..088E ; AL # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL 0890..0891 ; NU # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE -0898..089F ; CM # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; CM # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; AL # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; AL # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; CM # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA @@ -832,7 +832,7 @@ 1C5A..1C77 ; AL # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; AL # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C7E..1C7F ; BA # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD -1C80..1C88 ; AL # Ll [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; AL # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; AL # Lu [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; AL # Lu [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; AL # Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA @@ -2800,7 +2800,8 @@ FFFD ; AI # So REPLACEMENT CHARACTER 10EAB..10EAC ; CM # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EAD ; BA # Pd YEZIDI HYPHENATION MARK 10EB0..10EB1 ; AL # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE -10EFD..10EFF ; CM # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EC2..10EC4 ; AL # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC..10EFF ; CM # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F00..10F1C ; AL # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F1D..10F26 ; AL # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F27 ; AL # Lo OLD SOGDIAN LIGATURE AYIN-DALETH @@ -3071,6 +3072,9 @@ FFFD ; AI # So REPLACEMENT CHARACTER 11AB0..11ABF ; AL # Lo [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA 11AC0..11AF8 ; AL # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL 11B00..11B09 ; BB # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU +11BC0..11BE0 ; AL # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BE1 ; AL # Po SUNUWAR SIGN PVO +11BF0..11BF9 ; NU # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; AL # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; AL # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; CM # Mc BHAIKSUKI VOWEL SIGN AA diff --git a/unicodetools/data/ucd/dev/NormalizationTest.txt b/unicodetools/data/ucd/dev/NormalizationTest.txt index 2e8857424..757d8f5de 100644 --- a/unicodetools/data/ucd/dev/NormalizationTest.txt +++ b/unicodetools/data/ucd/dev/NormalizationTest.txt @@ -1,5 +1,5 @@ -# NormalizationTest-15.1.0.txt -# Date: 2023-01-05, 20:34:44 GMT +# NormalizationTest-16.0.0.txt +# Date: 2023-10-02, 12:41:11 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -17664,6 +17664,8 @@ FFEE;FFEE;FFEE;25CB;25CB; # (○; ○; ○; ○; ○; ) HALFWIDTH WHITE CIRCLE 0061 085A 059A 0316 1DFA 0062;0061 1DFA 085A 0316 059A 0062;0061 1DFA 085A 0316 059A 0062;0061 1DFA 085A 0316 059A 0062;0061 1DFA 085A 0316 059A 0062; # (a◌࡚◌֚◌̖◌᷺b; a◌᷺◌࡚◌̖◌֚b; a◌᷺◌࡚◌̖◌֚b; a◌᷺◌࡚◌̖◌֚b; a◌᷺◌࡚◌̖◌֚b; ) LATIN SMALL LETTER A, MANDAIC VOCALIZATION MARK, HEBREW ACCENT YETIV, COMBINING GRAVE ACCENT BELOW, COMBINING DOT BELOW LEFT, LATIN SMALL LETTER B 0061 059A 0316 1DFA 085B 0062;0061 1DFA 0316 085B 059A 0062;0061 1DFA 0316 085B 059A 0062;0061 1DFA 0316 085B 059A 0062;0061 1DFA 0316 085B 059A 0062; # (a◌֚◌̖◌᷺◌࡛b; a◌᷺◌̖◌࡛◌֚b; a◌᷺◌̖◌࡛◌֚b; a◌᷺◌̖◌࡛◌֚b; a◌᷺◌̖◌࡛◌֚b; ) LATIN SMALL LETTER A, HEBREW ACCENT YETIV, COMBINING GRAVE ACCENT BELOW, COMBINING DOT BELOW LEFT, MANDAIC GEMINATION MARK, LATIN SMALL LETTER B 0061 085B 059A 0316 1DFA 0062;0061 1DFA 085B 0316 059A 0062;0061 1DFA 085B 0316 059A 0062;0061 1DFA 085B 0316 059A 0062;0061 1DFA 085B 0316 059A 0062; # (a◌࡛◌֚◌̖◌᷺b; a◌᷺◌࡛◌̖◌֚b; a◌᷺◌࡛◌̖◌֚b; a◌᷺◌࡛◌̖◌֚b; a◌᷺◌࡛◌̖◌֚b; ) LATIN SMALL LETTER A, MANDAIC GEMINATION MARK, HEBREW ACCENT YETIV, COMBINING GRAVE ACCENT BELOW, COMBINING DOT BELOW LEFT, LATIN SMALL LETTER B +0061 0315 0300 05AE 0897 0062;00E0 05AE 0897 0315 0062;0061 05AE 0300 0897 0315 0062;00E0 05AE 0897 0315 0062;0061 05AE 0300 0897 0315 0062; # (a◌̕◌̀◌֮◌ࢗb; à◌֮◌ࢗ◌̕b; a◌֮◌̀◌ࢗ◌̕b; à◌֮◌ࢗ◌̕b; a◌֮◌̀◌ࢗ◌̕b; ) LATIN SMALL LETTER A, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT, HEBREW ACCENT ZINOR, ARABIC PEPET, LATIN SMALL LETTER B +0061 0897 0315 0300 05AE 0062;0061 05AE 0897 0300 0315 0062;0061 05AE 0897 0300 0315 0062;0061 05AE 0897 0300 0315 0062;0061 05AE 0897 0300 0315 0062; # (a◌ࢗ◌̕◌̀◌֮b; a◌֮◌ࢗ◌̀◌̕b; a◌֮◌ࢗ◌̀◌̕b; a◌֮◌ࢗ◌̀◌̕b; a◌֮◌ࢗ◌̀◌̕b; ) LATIN SMALL LETTER A, ARABIC PEPET, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT, HEBREW ACCENT ZINOR, LATIN SMALL LETTER B 0061 0315 0300 05AE 0898 0062;00E0 05AE 0898 0315 0062;0061 05AE 0300 0898 0315 0062;00E0 05AE 0898 0315 0062;0061 05AE 0300 0898 0315 0062; # (a◌̕◌̀◌֮◌࢘b; à◌֮◌࢘◌̕b; a◌֮◌̀◌࢘◌̕b; à◌֮◌࢘◌̕b; a◌֮◌̀◌࢘◌̕b; ) LATIN SMALL LETTER A, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT, HEBREW ACCENT ZINOR, ARABIC SMALL HIGH WORD AL-JUZ, LATIN SMALL LETTER B 0061 0898 0315 0300 05AE 0062;0061 05AE 0898 0300 0315 0062;0061 05AE 0898 0300 0315 0062;0061 05AE 0898 0300 0315 0062;0061 05AE 0898 0300 0315 0062; # (a◌࢘◌̕◌̀◌֮b; a◌֮◌࢘◌̀◌̕b; a◌֮◌࢘◌̀◌̕b; a◌֮◌࢘◌̀◌̕b; a◌֮◌࢘◌̀◌̕b; ) LATIN SMALL LETTER A, ARABIC SMALL HIGH WORD AL-JUZ, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT, HEBREW ACCENT ZINOR, LATIN SMALL LETTER B 0061 059A 0316 1DFA 0899 0062;0061 1DFA 0316 0899 059A 0062;0061 1DFA 0316 0899 059A 0062;0061 1DFA 0316 0899 059A 0062;0061 1DFA 0316 0899 059A 0062; # (a◌֚◌̖◌᷺◌࢙b; a◌᷺◌̖◌࢙◌֚b; a◌᷺◌̖◌࢙◌֚b; a◌᷺◌̖◌࢙◌֚b; a◌᷺◌̖◌࢙◌֚b; ) LATIN SMALL LETTER A, HEBREW ACCENT YETIV, COMBINING GRAVE ACCENT BELOW, COMBINING DOT BELOW LEFT, ARABIC SMALL LOW WORD ISHMAAM, LATIN SMALL LETTER B diff --git a/unicodetools/data/ucd/dev/PropList.txt b/unicodetools/data/ucd/dev/PropList.txt index 777e8a288..8c1f3934d 100644 --- a/unicodetools/data/ucd/dev/PropList.txt +++ b/unicodetools/data/ucd/dev/PropList.txt @@ -1,5 +1,5 @@ -# PropList-15.1.0.txt -# Date: 2023-08-01, 21:56:53 GMT +# PropList-16.0.0.txt +# Date: 2023-10-13, 11:33:44 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -450,6 +450,7 @@ FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L 081B..0823 ; Other_Alphabetic # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0825..0827 ; Other_Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082C ; Other_Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN +0897 ; Other_Alphabetic # Mn ARABIC PEPET 08D4..08DF ; Other_Alphabetic # Mn [12] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH WORD WAQFA 08E3..08E9 ; Other_Alphabetic # Mn [7] ARABIC TURNED DAMMA BELOW..ARABIC CURLY KASRATAN 08F0..0902 ; Other_Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA @@ -690,6 +691,7 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA 10A0C..10A0F ; Other_Alphabetic # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA 10D24..10D27 ; Other_Alphabetic # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; Other_Alphabetic # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK +10EFC ; Other_Alphabetic # Mn ARABIC COMBINING ALEF OVERLAY 11000 ; Other_Alphabetic # Mc BRAHMI SIGN CANDRABINDU 11001 ; Other_Alphabetic # Mn BRAHMI SIGN ANUSVARA 11002 ; Other_Alphabetic # Mc BRAHMI SIGN VISARGA @@ -834,7 +836,7 @@ FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA 1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 1425 +# Total code points: 1427 # ================================================ diff --git a/unicodetools/data/ucd/dev/PropertyValueAliases.txt b/unicodetools/data/ucd/dev/PropertyValueAliases.txt index 9039e9eb2..7dc6c1457 100644 --- a/unicodetools/data/ucd/dev/PropertyValueAliases.txt +++ b/unicodetools/data/ucd/dev/PropertyValueAliases.txt @@ -1,5 +1,5 @@ -# PropertyValueAliases-15.1.0.txt -# Date: 2023-08-07, 15:21:34 GMT +# PropertyValueAliases-16.0.0.txt +# Date: 2023-10-11, 21:16:05 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -426,6 +426,7 @@ blk; Soyombo ; Soyombo blk; Specials ; Specials blk; Sundanese ; Sundanese blk; Sundanese_Sup ; Sundanese_Supplement +blk; Sunuwar ; Sunuwar blk; Sup_Arrows_A ; Supplemental_Arrows_A blk; Sup_Arrows_B ; Supplemental_Arrows_B blk; Sup_Arrows_C ; Supplemental_Arrows_C @@ -1424,6 +1425,7 @@ sc ; Sogo ; Old_Sogdian sc ; Sora ; Sora_Sompeng sc ; Soyo ; Soyombo sc ; Sund ; Sundanese +sc ; Sunu ; Sunuwar sc ; Sylo ; Syloti_Nagri sc ; Syrc ; Syriac sc ; Tagb ; Tagbanwa diff --git a/unicodetools/data/ucd/dev/Scripts.txt b/unicodetools/data/ucd/dev/Scripts.txt index 40bfadcbb..1d914b667 100644 --- a/unicodetools/data/ucd/dev/Scripts.txt +++ b/unicodetools/data/ucd/dev/Scripts.txt @@ -1,5 +1,5 @@ # Scripts-16.0.0.txt -# Date: 2023-10-02, 12:51:59 GMT +# Date: 2023-10-13, 15:52:54 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -769,7 +769,7 @@ AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA 0487 ; Cyrillic # Mn COMBINING CYRILLIC POKRYTIE 0488..0489 ; Cyrillic # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN 048A..052F ; Cyrillic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER -1C80..1C88 ; Cyrillic # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; Cyrillic # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1D2B ; Cyrillic # L& CYRILLIC LETTER SMALL CAPITAL EL 1D78 ; Cyrillic # Lm MODIFIER LETTER CYRILLIC EN 2DE0..2DFF ; Cyrillic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS @@ -788,7 +788,7 @@ FE2E..FE2F ; Cyrillic # Mn [2] COMBINING CYRILLIC TITLO LEFT HALF..COMBININ 1E030..1E06D ; Cyrillic # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E08F ; Cyrillic # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I -# Total code points: 506 +# Total code points: 508 # ================================================ @@ -868,7 +868,7 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU 0888 ; Arabic # Sk ARABIC RAISED ROUND DOT 0889..088E ; Arabic # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL 0890..0891 ; Arabic # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE -0898..089F ; Arabic # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; Arabic # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; Arabic # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; Arabic # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; Arabic # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA @@ -886,7 +886,8 @@ FDFD..FDFF ; Arabic # So [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM. FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM 10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS -10EFD..10EFF ; Arabic # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EC2..10EC4 ; Arabic # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC..10EFF ; Arabic # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 1EE00..1EE03 ; Arabic # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; Arabic # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; Arabic # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM @@ -922,7 +923,7 @@ FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LA 1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL -# Total code points: 1368 +# Total code points: 1373 # ================================================ @@ -3030,4 +3031,12 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI # Total code points: 42 +# ================================================ + +11BC0..11BE0 ; Sunuwar # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BE1 ; Sunuwar # Po SUNUWAR SIGN PVO +11BF0..11BF9 ; Sunuwar # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE + +# Total code points: 44 + # EOF diff --git a/unicodetools/data/ucd/dev/UnicodeData.txt b/unicodetools/data/ucd/dev/UnicodeData.txt index b7586969c..c13107f25 100644 --- a/unicodetools/data/ucd/dev/UnicodeData.txt +++ b/unicodetools/data/ucd/dev/UnicodeData.txt @@ -2123,6 +2123,7 @@ 088E;ARABIC VERTICAL TAIL;Lo;0;AL;;;;;N;;;;; 0890;ARABIC POUND MARK ABOVE;Cf;0;AN;;;;;N;;;;; 0891;ARABIC PIASTRE MARK ABOVE;Cf;0;AN;;;;;N;;;;; +0897;ARABIC PEPET;Mn;230;NSM;;;;;N;;;;; 0898;ARABIC SMALL HIGH WORD AL-JUZ;Mn;230;NSM;;;;;N;;;;; 0899;ARABIC SMALL LOW WORD ISHMAAM;Mn;220;NSM;;;;;N;;;;; 089A;ARABIC SMALL LOW WORD IMAALA;Mn;220;NSM;;;;;N;;;;; @@ -6513,6 +6514,8 @@ 1C86;CYRILLIC SMALL LETTER TALL HARD SIGN;Ll;0;L;;;;;N;;;042A;;042A 1C87;CYRILLIC SMALL LETTER TALL YAT;Ll;0;L;;;;;N;;;0462;;0462 1C88;CYRILLIC SMALL LETTER UNBLENDED UK;Ll;0;L;;;;;N;;;A64A;;A64A +1C89;CYRILLIC CAPITAL LETTER TJE;Lu;0;L;;;;;N;;;;1C8A; +1C8A;CYRILLIC SMALL LETTER TJE;Ll;0;L;;;;;N;;;1C89;;1C89 1C90;GEORGIAN MTAVRULI CAPITAL LETTER AN;Lu;0;L;;;;;N;;;;10D0; 1C91;GEORGIAN MTAVRULI CAPITAL LETTER BAN;Lu;0;L;;;;;N;;;;10D1; 1C92;GEORGIAN MTAVRULI CAPITAL LETTER GAN;Lu;0;L;;;;;N;;;;10D2; @@ -19402,6 +19405,10 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 10EAD;YEZIDI HYPHENATION MARK;Pd;0;R;;;;;N;;;;; 10EB0;YEZIDI LETTER LAM WITH DOT ABOVE;Lo;0;R;;;;;N;;;;; 10EB1;YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE;Lo;0;R;;;;;N;;;;; +10EC2;ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; +10EC3;ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; +10EC4;ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; +10EFC;ARABIC COMBINING ALEF OVERLAY;Mn;0;NSM;;;;;N;;;;; 10EFD;ARABIC SMALL LOW WORD SAKTA;Mn;220;NSM;;;;;N;;;;; 10EFE;ARABIC SMALL LOW WORD QASR;Mn;220;NSM;;;;;N;;;;; 10EFF;ARABIC SMALL LOW WORD MADDA;Mn;220;NSM;;;;;N;;;;; @@ -21281,6 +21288,50 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 11B07;DEVANAGARI SIGN WESTERN NINE-LIKE BHALE;Po;0;L;;;;;N;;;;; 11B08;DEVANAGARI SIGN REVERSED NINE-LIKE BHALE;Po;0;L;;;;;N;;;;; 11B09;DEVANAGARI SIGN MINDU;Po;0;L;;;;;N;;;;; +11BC0;SUNUWAR LETTER DEVI;Lo;0;L;;;;;N;;;;; +11BC1;SUNUWAR LETTER TASLA;Lo;0;L;;;;;N;;;;; +11BC2;SUNUWAR LETTER EKO;Lo;0;L;;;;;N;;;;; +11BC3;SUNUWAR LETTER IMAR;Lo;0;L;;;;;N;;;;; +11BC4;SUNUWAR LETTER REU;Lo;0;L;;;;;N;;;;; +11BC5;SUNUWAR LETTER UTTHI;Lo;0;L;;;;;N;;;;; +11BC6;SUNUWAR LETTER KIK;Lo;0;L;;;;;N;;;;; +11BC7;SUNUWAR LETTER MA;Lo;0;L;;;;;N;;;;; +11BC8;SUNUWAR LETTER APPHO;Lo;0;L;;;;;N;;;;; +11BC9;SUNUWAR LETTER PIP;Lo;0;L;;;;;N;;;;; +11BCA;SUNUWAR LETTER GIL;Lo;0;L;;;;;N;;;;; +11BCB;SUNUWAR LETTER HAMSO;Lo;0;L;;;;;N;;;;; +11BCC;SUNUWAR LETTER CARMI;Lo;0;L;;;;;N;;;;; +11BCD;SUNUWAR LETTER NAH;Lo;0;L;;;;;N;;;;; +11BCE;SUNUWAR LETTER BUR;Lo;0;L;;;;;N;;;;; +11BCF;SUNUWAR LETTER JYAH;Lo;0;L;;;;;N;;;;; +11BD0;SUNUWAR LETTER LOACHA;Lo;0;L;;;;;N;;;;; +11BD1;SUNUWAR LETTER OTTHI;Lo;0;L;;;;;N;;;;; +11BD2;SUNUWAR LETTER SHYELE;Lo;0;L;;;;;N;;;;; +11BD3;SUNUWAR LETTER VARCA;Lo;0;L;;;;;N;;;;; +11BD4;SUNUWAR LETTER YAT;Lo;0;L;;;;;N;;;;; +11BD5;SUNUWAR LETTER AVA;Lo;0;L;;;;;N;;;;; +11BD6;SUNUWAR LETTER AAL;Lo;0;L;;;;;N;;;;; +11BD7;SUNUWAR LETTER DONGA;Lo;0;L;;;;;N;;;;; +11BD8;SUNUWAR LETTER THARI;Lo;0;L;;;;;N;;;;; +11BD9;SUNUWAR LETTER PHAR;Lo;0;L;;;;;N;;;;; +11BDA;SUNUWAR LETTER NGAR;Lo;0;L;;;;;N;;;;; +11BDB;SUNUWAR LETTER KHA;Lo;0;L;;;;;N;;;;; +11BDC;SUNUWAR LETTER SHYER;Lo;0;L;;;;;N;;;;; +11BDD;SUNUWAR LETTER CHELAP;Lo;0;L;;;;;N;;;;; +11BDE;SUNUWAR LETTER TENTU;Lo;0;L;;;;;N;;;;; +11BDF;SUNUWAR LETTER THELE;Lo;0;L;;;;;N;;;;; +11BE0;SUNUWAR LETTER KLOKO;Lo;0;L;;;;;N;;;;; +11BE1;SUNUWAR SIGN PVO;Po;0;L;;;;;N;;;;; +11BF0;SUNUWAR DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +11BF1;SUNUWAR DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +11BF2;SUNUWAR DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +11BF3;SUNUWAR DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +11BF4;SUNUWAR DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +11BF5;SUNUWAR DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +11BF6;SUNUWAR DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +11BF7;SUNUWAR DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +11BF8;SUNUWAR DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +11BF9;SUNUWAR DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 11C00;BHAIKSUKI LETTER A;Lo;0;L;;;;;N;;;;; 11C01;BHAIKSUKI LETTER AA;Lo;0;L;;;;;N;;;;; 11C02;BHAIKSUKI LETTER I;Lo;0;L;;;;;N;;;;; diff --git a/unicodetools/data/ucd/dev/VerticalOrientation.txt b/unicodetools/data/ucd/dev/VerticalOrientation.txt index 1cc701488..541f1428a 100644 --- a/unicodetools/data/ucd/dev/VerticalOrientation.txt +++ b/unicodetools/data/ucd/dev/VerticalOrientation.txt @@ -1,5 +1,5 @@ # VerticalOrientation-16.0.0.txt -# Date: 2023-10-02, 12:52:02 GMT +# Date: 2023-10-13, 15:52:57 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -268,7 +268,7 @@ 0888 ; R # Sk ARABIC RAISED ROUND DOT 0889..088E ; R # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL 0890..0891 ; R # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE -0898..089F ; R # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; R # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; R # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; R # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; R # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA @@ -793,7 +793,7 @@ 1C5A..1C77 ; R # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; R # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C7E..1C7F ; R # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD -1C80..1C88 ; R # Ll [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; R # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; R # Lu [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; R # Lu [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; R # Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA @@ -1771,7 +1771,8 @@ FFFC..FFFD ; U # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARA 10EAB..10EAC ; R # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EAD ; R # Pd YEZIDI HYPHENATION MARK 10EB0..10EB1 ; R # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE -10EFD..10EFF ; R # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EC2..10EC4 ; R # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC..10EFF ; R # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F00..10F1C ; R # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F1D..10F26 ; R # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F27 ; R # Lo OLD SOGDIAN LIGATURE AYIN-DALETH @@ -2023,6 +2024,9 @@ FFFC..FFFD ; U # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARA 11AB0..11ABF ; U # Lo [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA 11AC0..11AF8 ; R # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL 11B00..11B09 ; R # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU +11BC0..11BE0 ; R # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BE1 ; R # Po SUNUWAR SIGN PVO +11BF0..11BF9 ; R # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; R # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; R # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; R # Mc BHAIKSUKI VOWEL SIGN AA diff --git a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt index 12453cbdb..797d5c000 100644 --- a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt +++ b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt @@ -1,5 +1,5 @@ -# GraphemeBreakProperty-15.1.0.txt -# Date: 2023-01-05, 20:34:41 GMT +# GraphemeBreakProperty-16.0.0.txt +# Date: 2023-10-13, 11:29:23 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -106,7 +106,7 @@ E01F0..E0FFF ; Control # Cn [3600] .. 0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK -0898..089F ; Extend # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; Extend # Mn DEVANAGARI VOWEL SIGN OE @@ -325,7 +325,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 10AE5..10AE6 ; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK -10EFD..10EFF ; Extend # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EFC..10EFF ; Extend # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; Extend # Mn BRAHMI SIGN ANUSVARA @@ -459,7 +459,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 2130 +# Total code points: 2132 # ================================================ diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt index d4a6b8685..03b5da341 100644 --- a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt +++ b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt @@ -1,5 +1,5 @@ # SentenceBreakProperty-16.0.0.txt -# Date: 2023-10-02, 12:52:00 GMT +# Date: 2023-10-13, 15:52:55 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -55,7 +55,7 @@ 0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK -0898..089F ; Extend # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 0903 ; Extend # Mc DEVANAGARI SIGN VISARGA @@ -372,7 +372,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 10AE5..10AE6 ; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK -10EFD..10EFF ; Extend # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EFC..10EFF ; Extend # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11000 ; Extend # Mc BRAHMI SIGN CANDRABINDU @@ -567,7 +567,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 2550 +# Total code points: 2552 # ================================================ @@ -892,6 +892,7 @@ E0001 ; Format # Cf LANGUAGE TAG 10FC ; Lower # Lm MODIFIER LETTER GEORGIAN NAR 13F8..13FD ; Lower # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C88 ; Lower # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C8A ; Lower # L& CYRILLIC SMALL LETTER TJE 1D00..1D2B ; Lower # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D2C..1D6A ; Lower # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1D6B..1D77 ; Lower # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G @@ -1297,7 +1298,7 @@ FF41..FF5A ; Lower # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN 1E030..1E06D ; Lower # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE 1E922..1E943 ; Lower # L& [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA -# Total code points: 2497 +# Total code points: 2498 # ================================================ @@ -1577,6 +1578,7 @@ FF41..FF5A ; Lower # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN 10C7 ; Upper # L& GEORGIAN CAPITAL LETTER YN 10CD ; Upper # L& GEORGIAN CAPITAL LETTER AEN 13A0..13F5 ; Upper # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV +1C89 ; Upper # L& CYRILLIC CAPITAL LETTER TJE 1E00 ; Upper # L& LATIN CAPITAL LETTER A WITH RING BELOW 1E02 ; Upper # L& LATIN CAPITAL LETTER B WITH DOT ABOVE 1E04 ; Upper # L& LATIN CAPITAL LETTER B WITH DOT BELOW @@ -1954,7 +1956,7 @@ FF21..FF3A ; Upper # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LAT 1F150..1F169 ; Upper # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; Upper # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 1936 +# Total code points: 1937 # ================================================ @@ -2355,6 +2357,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 10D00..10D23 ; OLetter # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10E80..10EA9 ; OLetter # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; OLetter # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2..10EC4 ; OLetter # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10F00..10F1C ; OLetter # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; OLetter # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; OLetter # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -2426,6 +2429,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 11A5C..11A89 ; OLetter # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; OLetter # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; OLetter # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; OLetter # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; OLetter # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; OLetter # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; OLetter # Lo BHAIKSUKI SIGN AVAGRAHA @@ -2536,7 +2540,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 30000..3134A ; OLetter # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; OLetter # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 132660 +# Total code points: 132696 # ================================================ @@ -2598,6 +2602,7 @@ FF10..FF19 ; Numeric # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 11730..11739 ; Numeric # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 118E0..118E9 ; Numeric # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 11950..11959 ; Numeric # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE +11BF0..11BF9 ; Numeric # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C50..11C59 ; Numeric # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11D50..11D59 ; Numeric # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11DA0..11DA9 ; Numeric # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE @@ -2612,7 +2617,7 @@ FF10..FF19 ; Numeric # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 1E950..1E959 ; Numeric # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1FBF0..1FBF9 ; Numeric # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE -# Total code points: 694 +# Total code points: 704 # ================================================ diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt index aa9b6181f..d24f06dc8 100644 --- a/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt +++ b/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt @@ -1,5 +1,5 @@ # WordBreakProperty-16.0.0.txt -# Date: 2023-10-02, 12:52:03 GMT +# Date: 2023-10-13, 15:52:57 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -91,7 +91,7 @@ FB46..FB4F ; Hebrew_Letter # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW 0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK -0898..089F ; Extend # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 0903 ; Extend # Mc DEVANAGARI SIGN VISARGA @@ -408,7 +408,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT 10AE5..10AE6 ; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK -10EFD..10EFF ; Extend # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EFC..10EFF ; Extend # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11000 ; Extend # Mc BRAHMI SIGN CANDRABINDU @@ -604,7 +604,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 2554 +# Total code points: 2556 # ================================================ @@ -877,7 +877,7 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK 1C4D..1C4F ; ALetter # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; ALetter # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; ALetter # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD -1C80..1C88 ; ALetter # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; ALetter # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; ALetter # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; ALetter # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CE9..1CEC ; ALetter # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL @@ -1115,6 +1115,7 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 10D00..10D23 ; ALetter # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10E80..10EA9 ; ALetter # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; ALetter # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2..10EC4 ; ALetter # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10F00..10F1C ; ALetter # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; ALetter # Lo OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; ALetter # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -1185,6 +1186,7 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 11A5C..11A89 ; ALetter # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; ALetter # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; ALetter # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; ALetter # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; ALetter # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; ALetter # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; ALetter # Lo BHAIKSUKI SIGN AVAGRAHA @@ -1313,7 +1315,7 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 1F150..1F169 ; ALetter # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z 1F170..1F189 ; ALetter # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 29492 +# Total code points: 29530 # ================================================ @@ -1420,6 +1422,7 @@ FF10..FF19 ; Numeric # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 11730..11739 ; Numeric # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 118E0..118E9 ; Numeric # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 11950..11959 ; Numeric # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE +11BF0..11BF9 ; Numeric # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C50..11C59 ; Numeric # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11D50..11D59 ; Numeric # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11DA0..11DA9 ; Numeric # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE @@ -1434,7 +1437,7 @@ FF10..FF19 ; Numeric # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 1E950..1E959 ; Numeric # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1FBF0..1FBF9 ; Numeric # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE -# Total code points: 693 +# Total code points: 703 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt index 2d669538b..0d6c4ac6f 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt @@ -1,5 +1,5 @@ # DerivedBidiClass-16.0.0.txt -# Date: 2023-10-02, 12:51:27 GMT +# Date: 2023-10-13, 15:52:28 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -517,7 +517,7 @@ 1C5A..1C77 ; L # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; L # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C7E..1C7F ; L # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD -1C80..1C88 ; L # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; L # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; L # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; L # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; L # Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA @@ -991,6 +991,9 @@ FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER 11A9E..11AA2 ; L # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 11AB0..11AF8 ; L # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11B00..11B09 ; L # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU +11BC0..11BE0 ; L # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BE1 ; L # Po SUNUWAR SIGN PVO +11BF0..11BF9 ; L # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; L # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; L # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; L # Mc BHAIKSUKI VOWEL SIGN AA @@ -1182,7 +1185,7 @@ FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER F0000..FFFFD ; L # Co [65534] .. 100000..10FFFD; L # Co [65534] .. -# The above property value applies to 820460 code points not listed here. +# The above property value applies to 820414 code points not listed here. # Total code points: 1096267 # ================================================ @@ -2023,7 +2026,7 @@ FFFFE..FFFFF ; BN # Cn [2] .. 0825..0827 ; NSM # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; NSM # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; NSM # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK -0898..089F ; NSM # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; NSM # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; NSM # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; NSM # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; NSM # Mn DEVANAGARI VOWEL SIGN OE @@ -2224,7 +2227,7 @@ FE20..FE2F ; NSM # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC 10AE5..10AE6 ; NSM # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; NSM # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; NSM # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK -10EFD..10EFF ; NSM # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EFC..10EFF ; NSM # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; NSM # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; NSM # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; NSM # Mn BRAHMI SIGN ANUSVARA @@ -2348,7 +2351,7 @@ FE20..FE2F ; NSM # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC 1E944..1E94A ; NSM # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA E0100..E01EF ; NSM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 1993 +# Total code points: 1995 # ================================================ @@ -2395,6 +2398,7 @@ FDFC ; AL # Sc RIAL SIGN FE70..FE74 ; AL # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; AL # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM 10D00..10D23 ; AL # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA +10EC2..10EC4 ; AL # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10F30..10F45 ; AL # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN 10F51..10F54 ; AL # No [4] SOGDIAN NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED 10F55..10F59 ; AL # Po [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT @@ -2440,8 +2444,8 @@ FE76..FEFC ; AL # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WI 1EEA5..1EEA9 ; AL # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; AL # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN -# The above property value applies to 298 code points not listed here. -# Total code points: 1769 +# The above property value applies to 293 code points not listed here. +# Total code points: 1767 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt index e3339802a..23929b30f 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt @@ -1,5 +1,5 @@ # DerivedCombiningClass-16.0.0.txt -# Date: 2023-10-02, 12:51:29 GMT +# Date: 2023-10-13, 15:52:30 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -664,7 +664,7 @@ 1C5A..1C77 ; 0 # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; 0 # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C7E..1C7F ; 0 # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD -1C80..1C88 ; 0 # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; 0 # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; 0 # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; 0 # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; 0 # Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA @@ -1463,6 +1463,8 @@ FFFC..FFFD ; 0 # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER 10E80..10EA9 ; 0 # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EAD ; 0 # Pd YEZIDI HYPHENATION MARK 10EB0..10EB1 ; 0 # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2..10EC4 ; 0 # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC ; 0 # Mn ARABIC COMBINING ALEF OVERLAY 10F00..10F1C ; 0 # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F1D..10F26 ; 0 # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F27 ; 0 # Lo OLD SOGDIAN LIGATURE AYIN-DALETH @@ -1687,6 +1689,9 @@ FFFC..FFFD ; 0 # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER 11A9E..11AA2 ; 0 # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 11AB0..11AF8 ; 0 # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11B00..11B09 ; 0 # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU +11BC0..11BE0 ; 0 # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BE1 ; 0 # Po SUNUWAR SIGN PVO +11BF0..11BF9 ; 0 # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; 0 # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; 0 # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; 0 # Mc BHAIKSUKI VOWEL SIGN AA @@ -2005,8 +2010,8 @@ E0100..E01EF ; 0 # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 F0000..FFFFD ; 0 # Co [65534] .. 100000..10FFFD; 0 # Co [65534] .. -# The above property value applies to 826764 code points not listed here. -# Total code points: 1113190 +# The above property value applies to 826713 code points not listed here. +# Total code points: 1113189 # ================================================ @@ -2652,7 +2657,7 @@ FE27..FE2D ; 220 # Mn [7] COMBINING LIGATURE LEFT HALF BELOW..COMBINING CON 081B..0823 ; 230 # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A 0825..0827 ; 230 # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; 230 # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA -0898 ; 230 # Mn ARABIC SMALL HIGH WORD AL-JUZ +0897..0898 ; 230 # Mn [2] ARABIC PEPET..ARABIC SMALL HIGH WORD AL-JUZ 089C..089F ; 230 # Mn [4] ARABIC MADDA WAAJIB..ARABIC HALF MADDA OVER MADDA 08CA..08CE ; 230 # Mn [5] ARABIC SMALL HIGH FARSI YEH..ARABIC LARGE ROUND DOT ABOVE 08D4..08E1 ; 230 # Mn [14] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH SIGN SAFHA @@ -2741,7 +2746,7 @@ FE2E..FE2F ; 230 # Mn [2] COMBINING CYRILLIC TITLO LEFT HALF..COMBINING CYR 1E4EF ; 230 # Mn NAG MUNDARI SIGN SUTUH 1E944..1E949 ; 230 # Mn [6] ADLAM ALIF LENGTHENER..ADLAM GEMINATE CONSONANT MODIFIER -# Total code points: 510 +# Total code points: 511 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt index c21200371..c82890ade 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt @@ -1,5 +1,5 @@ # DerivedEastAsianWidth-16.0.0.txt -# Date: 2023-10-02, 12:51:32 GMT +# Date: 2023-10-13, 15:52:32 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -225,7 +225,7 @@ 0888 ; N # Sk ARABIC RAISED ROUND DOT 0889..088E ; N # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL 0890..0891 ; N # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE -0898..089F ; N # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; N # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; N # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF 08C9 ; N # Lm ARABIC SMALL FARSI YEH 08CA..08E1 ; N # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA @@ -743,7 +743,7 @@ 1C5A..1C77 ; N # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; N # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD 1C7E..1C7F ; N # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD -1C80..1C88 ; N # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; N # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; N # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; N # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; N # Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA @@ -1472,7 +1472,8 @@ FFFC ; N # So OBJECT REPLACEMENT CHARACTER 10EAB..10EAC ; N # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 10EAD ; N # Pd YEZIDI HYPHENATION MARK 10EB0..10EB1 ; N # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE -10EFD..10EFF ; N # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EC2..10EC4 ; N # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC..10EFF ; N # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F00..10F1C ; N # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F1D..10F26 ; N # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F27 ; N # Lo OLD SOGDIAN LIGATURE AYIN-DALETH @@ -1717,6 +1718,9 @@ FFFC ; N # So OBJECT REPLACEMENT CHARACTER 11A9E..11AA2 ; N # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 11AB0..11AF8 ; N # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL 11B00..11B09 ; N # Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU +11BC0..11BE0 ; N # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BE1 ; N # Po SUNUWAR SIGN PVO +11BF0..11BF9 ; N # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C00..11C08 ; N # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; N # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C2F ; N # Mc BHAIKSUKI VOWEL SIGN AA @@ -2043,7 +2047,7 @@ FFFC ; N # So OBJECT REPLACEMENT CHARACTER E0001 ; N # Cf LANGUAGE TAG E0020..E007F ; N # Cf [96] TAG SPACE..CANCEL TAG -# The above property value applies to 766282 code points not listed here. +# The above property value applies to 766231 code points not listed here. # Total code points: 792618 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt index 324b26b9d..e2a1f7ee6 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt @@ -1,5 +1,5 @@ # DerivedGeneralCategory-16.0.0.txt -# Date: 2023-10-02, 12:51:33 GMT +# Date: 2023-10-13, 15:52:32 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -37,7 +37,7 @@ 085F ; Cn # 086B..086F ; Cn # [5] .. 088F ; Cn # -0892..0897 ; Cn # [6] .. +0892..0896 ; Cn # [5] .. 0984 ; Cn # 098D..098E ; Cn # [2] .. 0991..0992 ; Cn # [2] .. @@ -234,7 +234,7 @@ 1BF4..1BFB ; Cn # [8] .. 1C38..1C3A ; Cn # [3] .. 1C4A..1C4C ; Cn # [3] .. -1C89..1C8F ; Cn # [7] .. +1C8B..1C8F ; Cn # [5] .. 1CBB..1CBC ; Cn # [2] .. 1CC8..1CCF ; Cn # [8] .. 1CFB..1CFF ; Cn # [5] .. @@ -435,7 +435,8 @@ FFFE..FFFF ; Cn # [2] .. 10E7F ; Cn # 10EAA ; Cn # 10EAE..10EAF ; Cn # [2] .. -10EB2..10EFC ; Cn # [75] .. +10EB2..10EC1 ; Cn # [16] .. +10EC5..10EFB ; Cn # [55] .. 10F28..10F2F ; Cn # [8] .. 10F5A..10F6F ; Cn # [22] .. 10F8A..10FAF ; Cn # [38] .. @@ -506,7 +507,9 @@ FFFE..FFFF ; Cn # [2] .. 11A48..11A4F ; Cn # [8] .. 11AA3..11AAF ; Cn # [13] .. 11AF9..11AFF ; Cn # [7] .. -11B0A..11BFF ; Cn # [246] .. +11B0A..11BBF ; Cn # [182] .. +11BE2..11BEF ; Cn # [14] .. +11BFA..11BFF ; Cn # [6] .. 11C09 ; Cn # 11C37 ; Cn # 11C46..11C4F ; Cn # [10] .. @@ -723,7 +726,7 @@ E01F0..EFFFF ; Cn # [65040] .. FFFFE..FFFFF ; Cn # [2] .. 10FFFE..10FFFF; Cn # [2] .. -# Total code points: 824716 +# Total code points: 824665 # ================================================ @@ -1005,6 +1008,7 @@ FFFFE..FFFFF ; Cn # [2] .. 10C7 ; Lu # GEORGIAN CAPITAL LETTER YN 10CD ; Lu # GEORGIAN CAPITAL LETTER AEN 13A0..13F5 ; Lu # [86] CHEROKEE LETTER A..CHEROKEE LETTER MV +1C89 ; Lu # CYRILLIC CAPITAL LETTER TJE 1C90..1CBA ; Lu # [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; Lu # [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1E00 ; Lu # LATIN CAPITAL LETTER A WITH RING BELOW @@ -1376,7 +1380,7 @@ FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAP 1D7CA ; Lu # MATHEMATICAL BOLD CAPITAL DIGAMMA 1E900..1E921 ; Lu # [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA -# Total code points: 1831 +# Total code points: 1832 # ================================================ @@ -1656,6 +1660,7 @@ FF21..FF3A ; Lu # [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAP 10FD..10FF ; Ll # [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN 13F8..13FD ; Ll # [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV 1C80..1C88 ; Ll # [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C8A ; Ll # CYRILLIC SMALL LETTER TJE 1D00..1D2B ; Ll # [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL 1D6B..1D77 ; Ll # [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G 1D79..1D9A ; Ll # [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK @@ -2041,7 +2046,7 @@ FF41..FF5A ; Ll # [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL 1DF25..1DF2A ; Ll # [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK 1E922..1E943 ; Ll # [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA -# Total code points: 2233 +# Total code points: 2234 # ================================================ @@ -2484,6 +2489,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 10D00..10D23 ; Lo # [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA 10E80..10EA9 ; Lo # [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; Lo # [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2..10EC4 ; Lo # [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10F00..10F1C ; Lo # [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F27 ; Lo # OLD SOGDIAN LIGATURE AYIN-DALETH 10F30..10F45 ; Lo # [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN @@ -2555,6 +2561,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 11A5C..11A89 ; Lo # [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; Lo # SOYOMBO MARK PLUTA 11AB0..11AF8 ; Lo # [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; Lo # [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO 11C00..11C08 ; Lo # [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; Lo # [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; Lo # BHAIKSUKI SIGN AVAGRAHA @@ -2654,7 +2661,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 30000..3134A ; Lo # [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A 31350..323AF ; Lo # [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF -# Total code points: 132236 +# Total code points: 132272 # ================================================ @@ -2684,7 +2691,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I 0825..0827 ; Mn # [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; Mn # [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; Mn # [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK -0898..089F ; Mn # [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; Mn # [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; Mn # [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; Mn # [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; Mn # DEVANAGARI VOWEL SIGN OE @@ -2883,7 +2890,7 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL 10AE5..10AE6 ; Mn # [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; Mn # [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; Mn # [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK -10EFD..10EFF ; Mn # [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EFC..10EFF ; Mn # [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; Mn # [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; Mn # [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; Mn # BRAHMI SIGN ANUSVARA @@ -3007,7 +3014,7 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL 1E944..1E94A ; Mn # [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA E0100..E01EF ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 1985 +# Total code points: 1987 # ================================================ @@ -3265,6 +3272,7 @@ FF10..FF19 ; Nd # [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 11730..11739 ; Nd # [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 118E0..118E9 ; Nd # [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 11950..11959 ; Nd # [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE +11BF0..11BF9 ; Nd # [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C50..11C59 ; Nd # [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11D50..11D59 ; Nd # [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11DA0..11DA9 ; Nd # [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE @@ -3279,7 +3287,7 @@ FF10..FF19 ; Nd # [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 1E950..1E959 ; Nd # [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1FBF0..1FBF9 ; Nd # [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE -# Total code points: 680 +# Total code points: 690 # ================================================ @@ -3847,6 +3855,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL 11A9A..11A9C ; Po # [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD 11A9E..11AA2 ; Po # [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 11B00..11B09 ; Po # [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU +11BE1 ; Po # SUNUWAR SIGN PVO 11C41..11C45 ; Po # [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 11C70..11C71 ; Po # [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD 11EF7..11EF8 ; Po # [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION @@ -3864,7 +3873,7 @@ FF64..FF65 ; Po # [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDL 1DA87..1DA8B ; Po # [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS 1E95E..1E95F ; Po # [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK -# Total code points: 628 +# Total code points: 629 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedJoiningGroup.txt b/unicodetools/data/ucd/dev/extracted/DerivedJoiningGroup.txt index 364847b91..2589107eb 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedJoiningGroup.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedJoiningGroup.txt @@ -1,5 +1,5 @@ -# DerivedJoiningGroup-15.1.0.txt -# Date: 2023-01-05, 20:34:37 GMT +# DerivedJoiningGroup-16.0.0.txt +# Date: 2023-10-02, 12:16:28 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -72,8 +72,9 @@ 06EE ; Dal # Lo ARABIC LETTER DAL WITH INVERTED V 0759..075A ; Dal # Lo [2] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW AND SMALL TAH..ARABIC LETTER DAL WITH INVERTED SMALL V BELOW 08AE ; Dal # Lo ARABIC LETTER DAL WITH THREE DOTS BELOW +10EC2 ; Dal # Lo ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW -# Total code points: 15 +# Total code points: 16 # ================================================ @@ -177,8 +178,9 @@ 06AC..06AE ; Kaf # Lo [3] ARABIC LETTER KAF WITH DOT ABOVE..ARABIC LETTER KAF WITH THREE DOTS BELOW 077F ; Kaf # Lo ARABIC LETTER KAF WITH TWO DOTS ABOVE 08B4 ; Kaf # Lo ARABIC LETTER KAF WITH DOT BELOW +10EC4 ; Kaf # Lo ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW -# Total code points: 6 +# Total code points: 7 # ================================================ @@ -331,8 +333,9 @@ 069F ; Tah # Lo ARABIC LETTER TAH WITH THREE DOTS ABOVE 088B..088C ; Tah # Lo [2] ARABIC LETTER TAH WITH DOT BELOW..ARABIC LETTER TAH WITH THREE DOTS BELOW 08A3 ; Tah # Lo ARABIC LETTER TAH WITH TWO DOTS ABOVE +10EC3 ; Tah # Lo ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW -# Total code points: 6 +# Total code points: 7 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt b/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt index a4e01e7d3..ea82c725e 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt @@ -1,5 +1,5 @@ -# DerivedJoiningType-15.1.0.txt -# Date: 2023-01-05, 20:34:38 GMT +# DerivedJoiningType-16.0.0.txt +# Date: 2023-10-13, 11:29:21 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -95,6 +95,7 @@ A840..A871 ; D # Lo [50] PHAGS-PA LETTER KA..PHAGS-PA SUBJOINED LETTER RA 10BAD..10BAE ; D # No [2] PSALTER PAHLAVI NUMBER TEN..PSALTER PAHLAVI NUMBER TWENTY 10D01..10D21 ; D # Lo [33] HANIFI ROHINGYA LETTER BA..HANIFI ROHINGYA VOWEL O 10D23 ; D # Lo HANIFI ROHINGYA MARK NA KHONNA +10EC3..10EC4 ; D # Lo [2] ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10F30..10F32 ; D # Lo [3] SOGDIAN LETTER ALEPH..SOGDIAN LETTER GIMEL 10F34..10F44 ; D # Lo [17] SOGDIAN LETTER WAW..SOGDIAN LETTER LESH 10F51..10F53 ; D # No [3] SOGDIAN NUMBER ONE..SOGDIAN NUMBER TWENTY @@ -110,7 +111,7 @@ A840..A871 ; D # Lo [50] PHAGS-PA LETTER KA..PHAGS-PA SUBJOINED LETTER RA 10FCA ; D # No CHORASMIAN NUMBER TWENTY 1E900..1E943 ; D # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA -# Total code points: 610 +# Total code points: 612 # ================================================ @@ -173,6 +174,7 @@ A840..A871 ; D # Lo [50] PHAGS-PA LETTER KA..PHAGS-PA SUBJOINED LETTER RA 10B91 ; R # Lo PSALTER PAHLAVI LETTER TAW 10BA9..10BAC ; R # No [4] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER FOUR 10D22 ; R # Lo HANIFI ROHINGYA MARK SAKIN +10EC2 ; R # Lo ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW 10F33 ; R # Lo SOGDIAN LETTER HE 10F54 ; R # No SOGDIAN NUMBER ONE HUNDRED 10F74..10F75 ; R # Lo [2] OLD UYGHUR LETTER ZAYIN..OLD UYGHUR LETTER FINAL HETH @@ -182,7 +184,7 @@ A840..A871 ; D # Lo [50] PHAGS-PA LETTER KA..PHAGS-PA SUBJOINED LETTER RA 10FC2..10FC3 ; R # Lo [2] CHORASMIAN LETTER RESH..CHORASMIAN LETTER SHIN 10FC9 ; R # No CHORASMIAN NUMBER TEN -# Total code points: 152 +# Total code points: 153 # ================================================ @@ -228,7 +230,7 @@ A872 ; L # Lo PHAGS-PA SUPERFIXED LETTER RA 0825..0827 ; T # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; T # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; T # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK -0898..089F ; T # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; T # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; T # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; T # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 093A ; T # Mn DEVANAGARI VOWEL SIGN OE @@ -438,7 +440,7 @@ FFF9..FFFB ; T # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATI 10AE5..10AE6 ; T # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; T # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; T # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK -10EFD..10EFF ; T # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EFC..10EFF ; T # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; T # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; T # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11001 ; T # Mn BRAHMI SIGN ANUSVARA @@ -568,6 +570,6 @@ E0001 ; T # Cf LANGUAGE TAG E0020..E007F ; T # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; T # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 2150 +# Total code points: 2152 # EOF diff --git a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt index d14e519f0..fab5c30c2 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt @@ -1,5 +1,5 @@ # DerivedLineBreak-16.0.0.txt -# Date: 2023-10-02, 12:51:34 GMT +# Date: 2023-10-13, 15:52:33 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -68,8 +68,8 @@ E000..F8FF ; XX # Co [6400] .. F0000..FFFFD ; XX # Co [65534] .. 100000..10FFFD; XX # Co [65534] .. -# The above property value applies to 762723 code points not listed here. -# Total code points: 900191 +# The above property value applies to 762672 code points not listed here. +# Total code points: 900140 # ================================================ @@ -533,6 +533,7 @@ ABF0..ABF9 ; NU # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE 116C0..116C9 ; NU # Nd [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE 11730..11739 ; NU # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 118E0..118E9 ; NU # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE +11BF0..11BF9 ; NU # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C50..11C59 ; NU # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11D50..11D59 ; NU # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11DA0..11DA9 ; NU # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE @@ -546,7 +547,7 @@ ABF0..ABF9 ; NU # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE 1E950..1E959 ; NU # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1FBF0..1FBF9 ; NU # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE -# Total code points: 624 +# Total code points: 634 # ================================================ @@ -851,7 +852,7 @@ ABF0..ABF9 ; NU # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE 1C4D..1C4F ; AL # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA 1C5A..1C77 ; AL # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH 1C78..1C7D ; AL # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD -1C80..1C88 ; AL # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C80..1C8A ; AL # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE 1C90..1CBA ; AL # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN 1CBD..1CBF ; AL # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN 1CC0..1CC7 ; AL # Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA @@ -1299,6 +1300,7 @@ FFED..FFEE ; AL # So [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE 10E60..10E7E ; AL # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS 10E80..10EA9 ; AL # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; AL # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2..10EC4 ; AL # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW 10F00..10F1C ; AL # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL 10F1D..10F26 ; AL # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF 10F27 ; AL # Lo OLD SOGDIAN LIGATURE AYIN-DALETH @@ -1373,6 +1375,8 @@ FFED..FFEE ; AL # So [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE 11A5C..11A89 ; AL # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA 11A9D ; AL # Lo SOYOMBO MARK PLUTA 11AB0..11AF8 ; AL # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11BC0..11BE0 ; AL # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO +11BE1 ; AL # Po SUNUWAR SIGN PVO 11C00..11C08 ; AL # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L 11C0A..11C2E ; AL # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA 11C40 ; AL # Lo BHAIKSUKI SIGN AVAGRAHA @@ -1580,7 +1584,7 @@ FFED..FFEE ; AL # So [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE 1FB00..1FB92 ; AL # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK 1FB94..1FBCA ; AL # So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON -# Total code points: 21731 +# Total code points: 21770 # ================================================ @@ -1881,7 +1885,7 @@ FE19 ; IN # Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS 0825..0827 ; CM # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082D ; CM # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA 0859..085B ; CM # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK -0898..089F ; CM # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +0897..089F ; CM # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08CA..08E1 ; CM # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA 08E3..0902 ; CM # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA 0903 ; CM # Mc DEVANAGARI SIGN VISARGA @@ -2146,7 +2150,7 @@ FFF9..FFFB ; CM # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTAT 10AE5..10AE6 ; CM # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 10D24..10D27 ; CM # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; CM # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK -10EFD..10EFF ; CM # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA +10EFC..10EFF ; CM # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA 10F46..10F50 ; CM # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW 10F82..10F85 ; CM # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW 11000 ; CM # Mc BRAHMI SIGN CANDRABINDU @@ -2336,7 +2340,7 @@ E0001 ; CM # Cf LANGUAGE TAG E0020..E007F ; CM # Cf [96] TAG SPACE..CANCEL TAG E0100..E01EF ; CM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 2429 +# Total code points: 2431 # ================================================ diff --git a/unicodetools/data/ucd/dev/extracted/DerivedName.txt b/unicodetools/data/ucd/dev/extracted/DerivedName.txt index ac78701d0..1c7f11fd5 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedName.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedName.txt @@ -1,5 +1,5 @@ # DerivedName-16.0.0.txt -# Date: 2023-10-02, 12:51:35 GMT +# Date: 2023-10-13, 15:52:34 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -2098,6 +2098,7 @@ 088E ; ARABIC VERTICAL TAIL 0890 ; ARABIC POUND MARK ABOVE 0891 ; ARABIC PIASTRE MARK ABOVE +0897 ; ARABIC PEPET 0898 ; ARABIC SMALL HIGH WORD AL-JUZ 0899 ; ARABIC SMALL LOW WORD ISHMAAM 089A ; ARABIC SMALL LOW WORD IMAALA @@ -6488,6 +6489,8 @@ 1C86 ; CYRILLIC SMALL LETTER TALL HARD SIGN 1C87 ; CYRILLIC SMALL LETTER TALL YAT 1C88 ; CYRILLIC SMALL LETTER UNBLENDED UK +1C89 ; CYRILLIC CAPITAL LETTER TJE +1C8A ; CYRILLIC SMALL LETTER TJE 1C90 ; GEORGIAN MTAVRULI CAPITAL LETTER AN 1C91 ; GEORGIAN MTAVRULI CAPITAL LETTER BAN 1C92 ; GEORGIAN MTAVRULI CAPITAL LETTER GAN @@ -30067,6 +30070,10 @@ FFFD ; REPLACEMENT CHARACTER 10EAD ; YEZIDI HYPHENATION MARK 10EB0 ; YEZIDI LETTER LAM WITH DOT ABOVE 10EB1 ; YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EC2 ; ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW +10EC3 ; ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW +10EC4 ; ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW +10EFC ; ARABIC COMBINING ALEF OVERLAY 10EFD ; ARABIC SMALL LOW WORD SAKTA 10EFE ; ARABIC SMALL LOW WORD QASR 10EFF ; ARABIC SMALL LOW WORD MADDA @@ -31946,6 +31953,50 @@ FFFD ; REPLACEMENT CHARACTER 11B07 ; DEVANAGARI SIGN WESTERN NINE-LIKE BHALE 11B08 ; DEVANAGARI SIGN REVERSED NINE-LIKE BHALE 11B09 ; DEVANAGARI SIGN MINDU +11BC0 ; SUNUWAR LETTER DEVI +11BC1 ; SUNUWAR LETTER TASLA +11BC2 ; SUNUWAR LETTER EKO +11BC3 ; SUNUWAR LETTER IMAR +11BC4 ; SUNUWAR LETTER REU +11BC5 ; SUNUWAR LETTER UTTHI +11BC6 ; SUNUWAR LETTER KIK +11BC7 ; SUNUWAR LETTER MA +11BC8 ; SUNUWAR LETTER APPHO +11BC9 ; SUNUWAR LETTER PIP +11BCA ; SUNUWAR LETTER GIL +11BCB ; SUNUWAR LETTER HAMSO +11BCC ; SUNUWAR LETTER CARMI +11BCD ; SUNUWAR LETTER NAH +11BCE ; SUNUWAR LETTER BUR +11BCF ; SUNUWAR LETTER JYAH +11BD0 ; SUNUWAR LETTER LOACHA +11BD1 ; SUNUWAR LETTER OTTHI +11BD2 ; SUNUWAR LETTER SHYELE +11BD3 ; SUNUWAR LETTER VARCA +11BD4 ; SUNUWAR LETTER YAT +11BD5 ; SUNUWAR LETTER AVA +11BD6 ; SUNUWAR LETTER AAL +11BD7 ; SUNUWAR LETTER DONGA +11BD8 ; SUNUWAR LETTER THARI +11BD9 ; SUNUWAR LETTER PHAR +11BDA ; SUNUWAR LETTER NGAR +11BDB ; SUNUWAR LETTER KHA +11BDC ; SUNUWAR LETTER SHYER +11BDD ; SUNUWAR LETTER CHELAP +11BDE ; SUNUWAR LETTER TENTU +11BDF ; SUNUWAR LETTER THELE +11BE0 ; SUNUWAR LETTER KLOKO +11BE1 ; SUNUWAR SIGN PVO +11BF0 ; SUNUWAR DIGIT ZERO +11BF1 ; SUNUWAR DIGIT ONE +11BF2 ; SUNUWAR DIGIT TWO +11BF3 ; SUNUWAR DIGIT THREE +11BF4 ; SUNUWAR DIGIT FOUR +11BF5 ; SUNUWAR DIGIT FIVE +11BF6 ; SUNUWAR DIGIT SIX +11BF7 ; SUNUWAR DIGIT SEVEN +11BF8 ; SUNUWAR DIGIT EIGHT +11BF9 ; SUNUWAR DIGIT NINE 11C00 ; BHAIKSUKI LETTER A 11C01 ; BHAIKSUKI LETTER AA 11C02 ; BHAIKSUKI LETTER I @@ -44178,6 +44229,6 @@ E01ED ; VARIATION SELECTOR-254 E01EE ; VARIATION SELECTOR-255 E01EF ; VARIATION SELECTOR-256 -# Total code points: 149815 +# Total code points: 149866 # EOF diff --git a/unicodetools/data/ucd/dev/extracted/DerivedNumericType.txt b/unicodetools/data/ucd/dev/extracted/DerivedNumericType.txt index 062f4fbe5..d5e24d4ed 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedNumericType.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedNumericType.txt @@ -1,5 +1,5 @@ -# DerivedNumericType-15.1.0.txt -# Date: 2023-01-05, 20:34:41 GMT +# DerivedNumericType-16.0.0.txt +# Date: 2023-10-11, 21:15:54 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -272,6 +272,7 @@ FF10..FF19 ; Decimal # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 11730..11739 ; Decimal # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE 118E0..118E9 ; Decimal # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE 11950..11959 ; Decimal # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE +11BF0..11BF9 ; Decimal # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE 11C50..11C59 ; Decimal # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE 11D50..11D59 ; Decimal # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE 11DA0..11DA9 ; Decimal # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE @@ -286,6 +287,6 @@ FF10..FF19 ; Decimal # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE 1E950..1E959 ; Decimal # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE 1FBF0..1FBF9 ; Decimal # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE -# Total code points: 680 +# Total code points: 690 # EOF diff --git a/unicodetools/data/ucd/dev/extracted/DerivedNumericValues.txt b/unicodetools/data/ucd/dev/extracted/DerivedNumericValues.txt index e67164682..bdde7e4f4 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedNumericValues.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedNumericValues.txt @@ -1,5 +1,5 @@ -# DerivedNumericValues-15.1.0.txt -# Date: 2023-01-05, 20:34:41 GMT +# DerivedNumericValues-16.0.0.txt +# Date: 2023-10-11, 21:15:54 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -103,6 +103,7 @@ FF10 ; 0.0 ; ; 0 # Nd FULLWIDTH DIGIT ZERO 11730 ; 0.0 ; ; 0 # Nd AHOM DIGIT ZERO 118E0 ; 0.0 ; ; 0 # Nd WARANG CITI DIGIT ZERO 11950 ; 0.0 ; ; 0 # Nd DIVES AKURU DIGIT ZERO +11BF0 ; 0.0 ; ; 0 # Nd SUNUWAR DIGIT ZERO 11C50 ; 0.0 ; ; 0 # Nd BHAIKSUKI DIGIT ZERO 11D50 ; 0.0 ; ; 0 # Nd MASARAM GONDI DIGIT ZERO 11DA0 ; 0.0 ; ; 0 # Nd GUNJALA GONDI DIGIT ZERO @@ -126,7 +127,7 @@ FF10 ; 0.0 ; ; 0 # Nd FULLWIDTH DIGIT ZERO 1F10B..1F10C ; 0.0 ; ; 0 # No [2] DINGBAT CIRCLED SANS-SERIF DIGIT ZERO..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO 1FBF0 ; 0.0 ; ; 0 # Nd SEGMENTED DIGIT ZERO -# Total code points: 88 +# Total code points: 89 # ================================================ @@ -514,6 +515,7 @@ FF11 ; 1.0 ; ; 1 # Nd FULLWIDTH DIGIT ONE 11731 ; 1.0 ; ; 1 # Nd AHOM DIGIT ONE 118E1 ; 1.0 ; ; 1 # Nd WARANG CITI DIGIT ONE 11951 ; 1.0 ; ; 1 # Nd DIVES AKURU DIGIT ONE +11BF1 ; 1.0 ; ; 1 # Nd SUNUWAR DIGIT ONE 11C51 ; 1.0 ; ; 1 # Nd BHAIKSUKI DIGIT ONE 11C5A ; 1.0 ; ; 1 # No BHAIKSUKI NUMBER ONE 11D51 ; 1.0 ; ; 1 # Nd MASARAM GONDI DIGIT ONE @@ -553,7 +555,7 @@ FF11 ; 1.0 ; ; 1 # Nd FULLWIDTH DIGIT ONE 1FBF1 ; 1.0 ; ; 1 # Nd SEGMENTED DIGIT ONE 2092A ; 1.0 ; ; 1 # Lo CJK UNIFIED IDEOGRAPH-2092A -# Total code points: 144 +# Total code points: 145 # ================================================ @@ -664,6 +666,7 @@ FF12 ; 2.0 ; ; 2 # Nd FULLWIDTH DIGIT TWO 11732 ; 2.0 ; ; 2 # Nd AHOM DIGIT TWO 118E2 ; 2.0 ; ; 2 # Nd WARANG CITI DIGIT TWO 11952 ; 2.0 ; ; 2 # Nd DIVES AKURU DIGIT TWO +11BF2 ; 2.0 ; ; 2 # Nd SUNUWAR DIGIT TWO 11C52 ; 2.0 ; ; 2 # Nd BHAIKSUKI DIGIT TWO 11C5B ; 2.0 ; ; 2 # No BHAIKSUKI NUMBER TWO 11D52 ; 2.0 ; ; 2 # Nd MASARAM GONDI DIGIT TWO @@ -707,7 +710,7 @@ FF12 ; 2.0 ; ; 2 # Nd FULLWIDTH DIGIT TWO 1FBF2 ; 2.0 ; ; 2 # Nd SEGMENTED DIGIT TWO 22390 ; 2.0 ; ; 2 # Lo CJK UNIFIED IDEOGRAPH-22390 -# Total code points: 146 +# Total code points: 147 # ================================================ @@ -809,6 +812,7 @@ FF13 ; 3.0 ; ; 3 # Nd FULLWIDTH DIGIT THREE 11733 ; 3.0 ; ; 3 # Nd AHOM DIGIT THREE 118E3 ; 3.0 ; ; 3 # Nd WARANG CITI DIGIT THREE 11953 ; 3.0 ; ; 3 # Nd DIVES AKURU DIGIT THREE +11BF3 ; 3.0 ; ; 3 # Nd SUNUWAR DIGIT THREE 11C53 ; 3.0 ; ; 3 # Nd BHAIKSUKI DIGIT THREE 11C5C ; 3.0 ; ; 3 # No BHAIKSUKI NUMBER THREE 11D53 ; 3.0 ; ; 3 # Nd MASARAM GONDI DIGIT THREE @@ -855,7 +859,7 @@ FF13 ; 3.0 ; ; 3 # Nd FULLWIDTH DIGIT THREE 22998 ; 3.0 ; ; 3 # Lo CJK UNIFIED IDEOGRAPH-22998 23B1B ; 3.0 ; ; 3 # Lo CJK UNIFIED IDEOGRAPH-23B1B -# Total code points: 144 +# Total code points: 145 # ================================================ @@ -951,6 +955,7 @@ FF14 ; 4.0 ; ; 4 # Nd FULLWIDTH DIGIT FOUR 11734 ; 4.0 ; ; 4 # Nd AHOM DIGIT FOUR 118E4 ; 4.0 ; ; 4 # Nd WARANG CITI DIGIT FOUR 11954 ; 4.0 ; ; 4 # Nd DIVES AKURU DIGIT FOUR +11BF4 ; 4.0 ; ; 4 # Nd SUNUWAR DIGIT FOUR 11C54 ; 4.0 ; ; 4 # Nd BHAIKSUKI DIGIT FOUR 11C5D ; 4.0 ; ; 4 # No BHAIKSUKI NUMBER FOUR 11D54 ; 4.0 ; ; 4 # Nd MASARAM GONDI DIGIT FOUR @@ -996,7 +1001,7 @@ FF14 ; 4.0 ; ; 4 # Nd FULLWIDTH DIGIT FOUR 200E2 ; 4.0 ; ; 4 # Lo CJK UNIFIED IDEOGRAPH-200E2 2626D ; 4.0 ; ; 4 # Lo CJK UNIFIED IDEOGRAPH-2626D -# Total code points: 135 +# Total code points: 136 # ================================================ @@ -1096,6 +1101,7 @@ FF15 ; 5.0 ; ; 5 # Nd FULLWIDTH DIGIT FIVE 11735 ; 5.0 ; ; 5 # Nd AHOM DIGIT FIVE 118E5 ; 5.0 ; ; 5 # Nd WARANG CITI DIGIT FIVE 11955 ; 5.0 ; ; 5 # Nd DIVES AKURU DIGIT FIVE +11BF5 ; 5.0 ; ; 5 # Nd SUNUWAR DIGIT FIVE 11C55 ; 5.0 ; ; 5 # Nd BHAIKSUKI DIGIT FIVE 11C5E ; 5.0 ; ; 5 # No BHAIKSUKI NUMBER FIVE 11D55 ; 5.0 ; ; 5 # Nd MASARAM GONDI DIGIT FIVE @@ -1139,7 +1145,7 @@ FF15 ; 5.0 ; ; 5 # Nd FULLWIDTH DIGIT FIVE 1FBF5 ; 5.0 ; ; 5 # Nd SEGMENTED DIGIT FIVE 20121 ; 5.0 ; ; 5 # Lo CJK UNIFIED IDEOGRAPH-20121 -# Total code points: 133 +# Total code points: 134 # ================================================ @@ -1229,6 +1235,7 @@ FF16 ; 6.0 ; ; 6 # Nd FULLWIDTH DIGIT SIX 11736 ; 6.0 ; ; 6 # Nd AHOM DIGIT SIX 118E6 ; 6.0 ; ; 6 # Nd WARANG CITI DIGIT SIX 11956 ; 6.0 ; ; 6 # Nd DIVES AKURU DIGIT SIX +11BF6 ; 6.0 ; ; 6 # Nd SUNUWAR DIGIT SIX 11C56 ; 6.0 ; ; 6 # Nd BHAIKSUKI DIGIT SIX 11C5F ; 6.0 ; ; 6 # No BHAIKSUKI NUMBER SIX 11D56 ; 6.0 ; ; 6 # Nd MASARAM GONDI DIGIT SIX @@ -1267,7 +1274,7 @@ FF16 ; 6.0 ; ; 6 # Nd FULLWIDTH DIGIT SIX 1FBF6 ; 6.0 ; ; 6 # Nd SEGMENTED DIGIT SIX 20AEA ; 6.0 ; ; 6 # Lo CJK UNIFIED IDEOGRAPH-20AEA -# Total code points: 117 +# Total code points: 118 # ================================================ @@ -1356,6 +1363,7 @@ FF17 ; 7.0 ; ; 7 # Nd FULLWIDTH DIGIT SEVEN 11737 ; 7.0 ; ; 7 # Nd AHOM DIGIT SEVEN 118E7 ; 7.0 ; ; 7 # Nd WARANG CITI DIGIT SEVEN 11957 ; 7.0 ; ; 7 # Nd DIVES AKURU DIGIT SEVEN +11BF7 ; 7.0 ; ; 7 # Nd SUNUWAR DIGIT SEVEN 11C57 ; 7.0 ; ; 7 # Nd BHAIKSUKI DIGIT SEVEN 11C60 ; 7.0 ; ; 7 # No BHAIKSUKI NUMBER SEVEN 11D57 ; 7.0 ; ; 7 # Nd MASARAM GONDI DIGIT SEVEN @@ -1393,7 +1401,7 @@ FF17 ; 7.0 ; ; 7 # Nd FULLWIDTH DIGIT SEVEN 1FBF7 ; 7.0 ; ; 7 # Nd SEGMENTED DIGIT SEVEN 20001 ; 7.0 ; ; 7 # Lo CJK UNIFIED IDEOGRAPH-20001 -# Total code points: 117 +# Total code points: 118 # ================================================ @@ -1479,6 +1487,7 @@ FF18 ; 8.0 ; ; 8 # Nd FULLWIDTH DIGIT EIGHT 11738 ; 8.0 ; ; 8 # Nd AHOM DIGIT EIGHT 118E8 ; 8.0 ; ; 8 # Nd WARANG CITI DIGIT EIGHT 11958 ; 8.0 ; ; 8 # Nd DIVES AKURU DIGIT EIGHT +11BF8 ; 8.0 ; ; 8 # Nd SUNUWAR DIGIT EIGHT 11C58 ; 8.0 ; ; 8 # Nd BHAIKSUKI DIGIT EIGHT 11C61 ; 8.0 ; ; 8 # No BHAIKSUKI NUMBER EIGHT 11D58 ; 8.0 ; ; 8 # Nd MASARAM GONDI DIGIT EIGHT @@ -1515,7 +1524,7 @@ FF18 ; 8.0 ; ; 8 # Nd FULLWIDTH DIGIT EIGHT 1F109 ; 8.0 ; ; 8 # No DIGIT EIGHT COMMA 1FBF8 ; 8.0 ; ; 8 # Nd SEGMENTED DIGIT EIGHT -# Total code points: 112 +# Total code points: 113 # ================================================ @@ -1604,6 +1613,7 @@ FF19 ; 9.0 ; ; 9 # Nd FULLWIDTH DIGIT NINE 11739 ; 9.0 ; ; 9 # Nd AHOM DIGIT NINE 118E9 ; 9.0 ; ; 9 # Nd WARANG CITI DIGIT NINE 11959 ; 9.0 ; ; 9 # Nd DIVES AKURU DIGIT NINE +11BF9 ; 9.0 ; ; 9 # Nd SUNUWAR DIGIT NINE 11C59 ; 9.0 ; ; 9 # Nd BHAIKSUKI DIGIT NINE 11C62 ; 9.0 ; ; 9 # No BHAIKSUKI NUMBER NINE 11D59 ; 9.0 ; ; 9 # Nd MASARAM GONDI DIGIT NINE @@ -1641,7 +1651,7 @@ FF19 ; 9.0 ; ; 9 # Nd FULLWIDTH DIGIT NINE 1FBF9 ; 9.0 ; ; 9 # Nd SEGMENTED DIGIT NINE 2F890 ; 9.0 ; ; 9 # Lo CJK COMPATIBILITY IDEOGRAPH-2F890 -# Total code points: 118 +# Total code points: 119 # ================================================ diff --git a/unicodetools/src/main/java/org/unicode/props/UcdLineParser.java b/unicodetools/src/main/java/org/unicode/props/UcdLineParser.java index 712d5e0c0..c17f3c326 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdLineParser.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdLineParser.java @@ -111,6 +111,11 @@ public boolean hasNext() { return false; } line = line2 = rawLines.next(); + if (line.startsWith("<<<<<<<") + || line.startsWith("=======") + || line.startsWith(">>>>>>>")) { + line2 = ""; + } ++stats.lineCount; final int hashPos = line2.indexOf('#'); if (hashPos >= 0) { diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index c5eb7e092..a9ca850c6 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -451,6 +451,7 @@ public enum Block_Values implements Named { Specials("Specials"), Sundanese("Sundanese"), Sundanese_Supplement("Sundanese_Sup"), + Sunuwar("Sunuwar"), Supplemental_Arrows_A("Sup_Arrows_A"), Supplemental_Arrows_B("Sup_Arrows_B"), Supplemental_Arrows_C("Sup_Arrows_C"), @@ -1833,6 +1834,7 @@ public enum Script_Values implements Named { Sora_Sompeng("Sora"), Soyombo("Soyo"), Sundanese("Sund"), + Sunuwar("Sunu"), Syloti_Nagri("Sylo"), Syriac("Syrc"), Tagbanwa("Tagb"), diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index e1ff508ad..759361106 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -67,6 +67,7 @@ static class Format { Map> fileToPropertySet = new TreeMap>(); Map fileToComments = new TreeMap(); Map fileToDirectory = new TreeMap(); + Map> propertyToOrderedValues = new TreeMap>(); Map> propertyToValueToComments = new TreeMap>(); Map hackMap = new HashMap(); @@ -110,6 +111,12 @@ public static class PrintStyle { // Unicode 15.1 and later LineBreak.txt and EastAsianWidth.txt, which are all generated // in that format by some other tool. boolean kenFile = false; + // Whether the file should be produced in the style of IndicPositionalCategory.txt and + // IndicSyllabicCategory.txt, which are both generated in that format by some other + // tool. + boolean roozbehFile = false; + // Whether to separate values of enumerated properties using a line of equal signs. + boolean separateValues = true; boolean hackValues = false; boolean mergeRanges = true; String nameStyle = "none"; @@ -138,6 +145,10 @@ String parse(String options) { interleaveValues = true; } else if (piece.equals("kenFile")) { kenFile = true; + } else if (piece.equals("roozbehFile")) { + roozbehFile = true; + } else if (piece.startsWith("separateValues=")) { + separateValues = afterEqualsBoolean(piece); } else if (piece.equals("hackValues")) { hackValues = true; } else if (piece.equals("sortNumeric")) { @@ -301,6 +312,10 @@ private void build() { } line = line.trim(); if (line.length() == 0) { + if (comments.length() != 0) { + // Preserve blank lines between comments. + comments += "\n"; + } continue; } if (DEBUG) { @@ -321,6 +336,7 @@ private void build() { comments += line; } else { // end of comments, roll up + comments = comments.trim(); if (comments.length() != 0) { if (property != null) { addValueComments(property, value, comments); @@ -350,6 +366,10 @@ private void build() { value = ""; } else if (line.startsWith("Value:")) { value = lineValue; + final var values = + propertyToOrderedValues.computeIfAbsent( + property, k -> new ArrayList()); + values.add(value); } else if (line.startsWith("HackName:")) { final String regularItem = Utility.getUnskeleton(lineValue, true); hackMap.put(regularItem, lineValue); @@ -1152,6 +1172,9 @@ public static void generatePropertyFile(String filename) throws IOException { filename, Format.theFormat.getPrintStyle(name)); if (!ps.kenFile) { pwProp.println(); + if (!ps.separateValues) { + pwProp.println(); + } pwProp.println(SEPARATOR); } final String propComment = Format.theFormat.getValueComments(name, ""); @@ -1161,7 +1184,11 @@ public static void generatePropertyFile(String filename) throws IOException { pwProp.println(propComment); } else if (!prop.isType(UnicodeProperty.BINARY_MASK)) { pwProp.println(); - pwProp.println("# Property:\t" + name); + if (ps.roozbehFile) { + pwProp.println("# Property: " + name); + } else { + pwProp.println("# Property:\t" + name); + } } } @@ -1182,9 +1209,12 @@ public static void generatePropertyFile(String filename) throws IOException { v = v + " (" + v2 + ")"; } } - pwProp.println(); + pwProp.println(ps.roozbehFile ? "#" : ""); pwProp.println("# All code points not explicitly listed for " + prop.getName()); - pwProp.println("# have the value " + v + "."); + pwProp.println( + "# have the value " + + v + + (ps.roozbehFile && v.equals("NA") ? " (not applicable)." : ".")); } if (!ps.interleaveValues && prop.isType(UnicodeProperty.BINARY_MASK)) { @@ -1254,6 +1284,21 @@ private static void writeEnumeratedValues( temp2.addAll(aliases); aliases = temp2; } + if (ps.roozbehFile) { + aliases.removeIf(alias -> UnicodeProperty.compareNames(alias, ps.skipValue) == 0); + if (!Format.theFormat + .propertyToOrderedValues + .get(prop.getName()) + .containsAll(aliases)) { + final TreeSet missingAliases = new TreeSet(aliases); + missingAliases.removeAll( + Format.theFormat.propertyToOrderedValues.get(prop.getName())); + throw new IllegalArgumentException( + "All values must be listed when using roozbehFile; missing " + + missingAliases); + } + aliases = Format.theFormat.propertyToOrderedValues.get(prop.getName()); + } if (ps.sortNumeric) { if (DEBUG) { System.out.println("Reordering"); @@ -1284,7 +1329,7 @@ private static void writeEnumeratedValues( final String missing = ps.skipUnassigned != null ? ps.skipUnassigned : ps.skipValue; if (missing != null && !missing.equals(UCD_Names.NO)) { - pw.println(); + pw.println(ps.roozbehFile ? "#" : ""); final String propName = bf.getPropName(); // if (propName == null) propName = ""; // else if (propName.length() != 0) propName = propName + "; "; @@ -1302,6 +1347,10 @@ private static void writeEnumeratedValues( writeEnumeratedMissingValues(pw, overallDefault, defaultLbValues); } } + if (!ps.separateValues) { + pw.println(); + pw.println(SEPARATOR.replace('=', '-')); + } for (final Iterator it = aliases.iterator(); it.hasNext(); ) { final String value = it.next(); if (DEBUG) { @@ -1416,9 +1465,13 @@ private static void writeEnumeratedValues( if (!prop.isType(UnicodeProperty.BINARY_MASK)) { pw.println(); - pw.println(SEPARATOR); + if (ps.separateValues) { + pw.println(SEPARATOR); + } if (nonLongValue) { - pw.println(); + if (ps.separateValues) { + pw.println(); + } pw.println("# " + prop.getName() + "=" + value); } } @@ -1442,6 +1495,11 @@ private static void writeEnumeratedValues( pw.println(); // if (s.size() != 0) bf.setMergeRanges(ps.mergeRanges); + bf.setShowTotal(!ps.roozbehFile); + if (ps.roozbehFile) { + bf.setRangeBreakSource( + ToolUnicodePropertySource.make(Default.ucdVersion()).getProperty("Block")); + } bf.showSetNames(pw, s); if (DEBUG) { System.out.println(bf.showSetNames(s)); diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index af97cfdac..dbbea74f6 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -503,7 +503,7 @@ private static PropertyComparison getPropertyComparison(ParsePosition pp, String propertyComparison.valueSet = new UnicodeSet(line, pp, symbolTable); propertyComparison.property1 = CompoundProperty.of(LATEST_PROPS, line, pp); final int cp = line.codePointAt(pp.getIndex()); - if (cp != '=' && cp != 'x') { + if (cp != '=' && cp != '≠') { throw new ParseException(line, pp.getIndex()); } propertyComparison.shouldBeEqual = cp == '='; diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java index c48269675..0f12b0ffc 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java @@ -414,19 +414,19 @@ public final class UCD_Names implements UCD_Types { // Unicode 15 "Kawi", "Nag_Mundari", - // A future version of Unicode - "Sunuwar", - "Tulu_Tigalari", - "Kirat_Rai", - "Todhri", + // Unicode 16 "Garay", "Gurung_Khema", + "Kirat_Rai", "Ol_Onal", + "Sunuwar", + "Todhri", + "Tulu_Tigalari", // Provisionally assigned - "Sidetic", "Chisoi", - "Tolong_Siki", + "Sidetic", "Tai_Yo", + "Tolong_Siki", }; public static final Relation EXTRA_SCRIPT = @@ -611,19 +611,19 @@ public final class UCD_Names implements UCD_Types { // Unicode 15 "Kawi", "Nagm", - // A future version of Unicode - "Qaba", - "Qabb", - "Qabc", - "Qabd", - "Qabe", - "Qabf", - "Qabg", + // Unicode 16 + "Gara", + "Gukh", + "Krai", + "Onao", + "Sunu", + "Todr", + "Tutg", // Provisionally assigned - "Qabh", - "Qabi", - "Qabj", - "Qabk", + "Chis", + "Sidt", + "Tayo", + "Tols", }; static final String[] SHORT_AGE = { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Types.java b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Types.java index 6f5a76340..972753c37 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Types.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Types.java @@ -599,20 +599,20 @@ public interface UCD_Types { // Unicode 15 Kawi = 164, Nag_Mundari = 165, - // A future version of Unicode - Sunuwar = 166, - Tulu_Tigalari = 167, + // Unicode 16 + Garay = 166, + Gurung_Khema = 167, Kirat_Rai = 168, - Todhri = 169, - Garay = 170, - Gurung_Khema = 171, - Ol_Onal = 172, + Ol_Onal = 169, + Sunuwar = 170, + Todhri = 171, + Tulu_Tigalari = 172, // Provisionally assigned - Sidetic = 173, - Chisoi = 174, - Tolong_Siki = 175, - Tai_Yo = 176, - LIMIT_SCRIPT = Tai_Yo + 1; + Chisoi = 173, + Sidetic = 174, + Tai_Yo = 175, + Tolong_Siki = 176, + LIMIT_SCRIPT = Tolong_Siki + 1; // Bidi_Paired_Bracket_Type public static final byte BPT_N = 0, BPT_O = 1, BPT_C = 2, LIMIT_BPT = 3; diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt index 702c46ca5..db8ebd7b8 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt @@ -908,6 +908,376 @@ Format: kenFile skipValue=Rotated # Property: VerticalOrientation +File: IndicPositionalCategory +# +# This file defines the following property: +# +# Indic_Positional_Category enumerated property +# +# Scope: This property is aimed at the problem of +# the specification of syllabic structure for Indic scripts. +# Because dependent vowels (matras), visible viramas, and other +# characters are placed in notional slots around the consonant (or +# consonant cluster) core of an Indic syllable, there may be +# cooccurrence constraints or other interactions. Also, it may be +# desirable, in cases where more than one such character may occur in +# sequence, as for example, in a top slot and a bottom slot, to +# specify preferred orders for spelling. As such, this property +# is designed primarily to supplement the Indic_Syllabic_Category +# property. +# +# In addition to combining marks associated with Indic scripts, the +# Indic_Positional_Category has non-trivial values for special signs +# associated with Indic_Syllabic_Category=Consonant_Prefixed +# or Indic_Syllabic_Category=Consonant_Preceding_Repha. Those signs +# have General_Category=Lo, rather than being combining marks. +# They occur in initial position in syllabic structure. However, when +# rendered, they appear as marks positioned with respect to another +# base letter (usually above it). Hence, having an explicit value for +# Indic_Positional_Category for those signs can be helpful. +# +# Note that this property is *not* intended as +# a prescriptive property regarding display or font design, +# for a number of reasons. Good font design requires information +# that is outside the context of a character encoding standard, +# and is best handled in other venues. For Indic dependent +# vowels and similar characters, in particular: +# +# 1. Matra placement may vary somewhat based on typeface design. +# 2. Matra placement, even within a single script, may vary +# somewhat according to historic period or local conventions. +# 3. Matra placement may be changed by explicit orthographic reform +# decisions. +# 4. Matras may ligate in various ways with a consonant (or even +# other elements of a syllable) instead of occurring in a +# discrete location. +# 5. Matra display may be contextually determined. This is +# notable, for example, in the Tamil script, where the shape +# and placement of -u and -uu vowels depends strongly on +# which consonant they adjoin. +# +# Format: +# Field 0 Unicode code point value or range of code point values +# Field 1 Indic_Positional_Category property value +# +# Field 1 is followed by a comment field, starting with the number sign '#', +# which shows the General_Category property value, the Unicode character name +# or names, and, in lines with ranges of code points, the code point count in +# square brackets. +# +# The scripts assessed as containing dependent vowels or similar characters +# in the structural sense used for the Indic_Positional_Category are the +# following: +# +# Ahom, Balinese, Batak, Bengali, Bhaiksuki, Brahmi, Buginese, Buhid, +# Chakma, Cham, Devanagari, Dives Akuru, Dogra, Grantha, Gujarati, +# Gunjala Gondi, Gurmukhi, Hanunoo, Javanese, Kaithi, Kannada, Kawi, +# Kayah Li, Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Lepcha, Limbu, +# Makasar, Malayalam, Marchen, Masaram Gondi, Meetei Mayek, Modi, +# Myanmar, Nandinagari, Newa, New Tai Lue, Oriya, Rejang, Saurashtra, +# Sharada, Siddham, Sinhala, Soyombo, Sundanese, Syloti Nagri, +# Tagalog, Tagbanwa, Tai Tham, Tai Viet, Takri, Tamil, Telugu, Thai, +# Tibetan, Tirhuta, and Zanabazar Square. +# +# All characters for all other scripts not in that list +# take the default value for this property. +# +# See IndicSyllabicCategory.txt for a slightly more extended +# list of Indic scripts, including those which do not have +# positional characters. Currently, those additional +# Indic scripts without positional characters are +# Multani, Phags-pa, and Tai Le. +# +# Notes: +# +# 1. The following characters are all assigned the positional category Right, +# but may have different positions in some cases: +# * U+0BC1 TAMIL VOWEL SIGN U and U+0BC2 TAMIL VOWEL SIGN UU have +# contextually variable placement in Tamil. +# * U+0D41 MALAYALAM VOWEL SIGN U and U+0D42 MALAYALAM VOWEL SIGN UU form +# complex ligatures with consonants in older Malayalam orthography. +# * U+11341 GRANTHA VOWEL SIGN U and U+11342 GRANTHA VOWEL SIGN UU have +# contextually variable placement in Grantha. +# * U+11440 NEWA VOWEL SIGN O and U+11441 NEWA VOWEL SIGN AU have contextually +# variable placement in Newa. +# +# 2. The following characters are all assigned the positional category Top, +# but may have different positions in some cases: +# * U+1143E NEWA VOWEL SIGN E and U+1143F NEWA VOWEL SIGN AI have contextually +# variable placement in Newa. +# +# 3. The following characters are all assigned the positional category Bottom, +# but may have different positions in some cases: +# * U+102F MYANMAR VOWEL SIGN U and U+1030 MYANMAR VOWEL SIGN UU have +# contextually variable placement in Myanmar. +# * U+1A69 TAI THAM VOWEL SIGN U and U+1A6A TAI THAM VOWEL SIGN UU have +# contextually variable placement in Tai Tham. +# +# 4. The following character is assigned the positional category Left, but +# may have different positions in different styles: +# * U+119D2 NANDINAGARI VOWEL SIGN I has stylistically variable placement +# in Nandinagari. +Property: Indic_Positional_Category +Format: roozbehFile separateValues=false valueStyle=short skipValue=NA +Value: Right +Value: Left +Value: Visual_Order_Left + +# These are dependent vowels that occur to the left of the consonant +# letter in a syllable, but which occur in scripts using the visual order +# model, instead of the logical order model. Because of the different +# model, these left-side vowels occur first in the backing store (before +# the consonant letter) and are not reordered during text rendering. +# +# [Derivation: Logical_Order_Exception=Yes] +Value: Left_And_Right +Value: Top +Value: Bottom +Value: Top_And_Bottom +Value: Top_And_Right +Value: Top_And_Left +Value: Top_And_Left_And_Right +Value: Bottom_And_Right +Value: Bottom_And_Left +Value: Top_And_Bottom_And_Right +Value: Top_And_Bottom_And_Left +Value: Overstruck + +File: IndicSyllabicCategory +# +# This file defines the following property: +# +# Indic_Syllabic_Category enumerated property +# +# Scope: This property is aimed at two general problem +# areas involving the analysis and processing of Indic scripts: +# +# 1. Specification of syllabic structure. +# 2. Specification of segmentation rules. +# +# Both of these problem areas may benefit from having defined subtypes +# of Indic script characters which are relevant to how Indic +# syllables (or aksaras) are constructed. Note that rules for +# syllabic structure in Indic scripts may differ significantly +# from how phonological syllables are defined. +# +# Format: +# Field 0 Unicode code point value or range of code point values +# Field 1 Indic_Syllabic_Category property value +# +# Field 1 is followed by a comment field, starting with the number sign '#', +# which shows the General_Category property value, the Unicode character name +# or names, and, in lines with ranges of code points, the code point count in +# square brackets. +# +# The scripts assessed as Indic in the structural sense used for the +# Indic_Syllabic_Category are the following: +# +# Ahom, Balinese, Batak, Bengali, Bhaiksuki, Brahmi, Buginese, Buhid, +# Chakma, Cham, Devanagari, Dives Akuru, Dogra, Grantha, Gujarati, +# Gunjala Gondi, Gurmukhi, Hanunoo, Javanese, Kaithi, Kannada, Kawi, +# Kayah Li, Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Lepcha, Limbu, +# Mahajani, Makasar, Malayalam, Marchen, Masaram Gondi, Meetei Mayek, +# Modi, Multani, Myanmar, Nandinagari, Newa, New Tai Lue, Oriya, +# Phags-pa, Rejang, Saurashtra, Sharada, Siddham, Sinhala, Soyombo, +# Sundanese, Syloti Nagri, Tagalog, Tagbanwa, Tai Le, Tai Tham, +# Tai Viet, Takri, Tamil, Telugu, Thai, Tibetan, Tirhuta, and +# Zanabazar Square. +# +# All characters for all other scripts not in that list +# take the default value for this property, unless they +# are individually listed in this data file. +# +Property: Indic_Syllabic_Category +Format: roozbehFile valueStyle=short skipValue=Other +Value: Bindu +# Bindu/Anusvara (nasalization or -n) + +# [Not derivable] +Value: Visarga +# Visarga (-h) +# Excludes letters for jihvamuliya and upadhmaniya, which are +# related, but structured somewhat differently. + +# [Not derivable] +Value: Avagraha +# Avagraha (elision of initial a- in sandhi) + +# [Not derivable] +Value: Nukta +# Nukta (diacritic for borrowed consonants or other consonant +# modifications). Note that while the resulting sound is typically a +# consonant, the base letter a nukta follows may be an independent +# vowel. For example, is used to transcribe ARABIC LETTER +# AIN. + +# [Not derivable] +Value: Virama +# Virama (killing of inherent vowel in consonant sequence +# or consonant stacker) +# Only includes characters that can act both as visible killer viramas +# and consonant stackers. Separate property values exist for characters +# that can only act as pure killers or only as consonant stackers. + +# [Derivation: (ccc=9) - (InSC=Pure_Killer) - (InSC=Invisible_Stacker) +# - (InSC=Number_Joiner) - 2D7F] +Value: Pure_Killer +# Pure killer (killing of inherent vowel in consonant sequence, +# with no consonant stacking behavior) + +# [Not derivable] +Value: Invisible_Stacker +# Invisible stacker (invisible consonant stacker virama). +# +# Note that in some scripts, such as Kharoshthi and Masaram Gondi, an invisible +# stacker may have a second function, changing the shape and/or location of the +# consonant preceding it, even when there is no consonant following the +# invisible stacker. + +# [Not derivable] +Value: Vowel_Independent +# Independent Vowels (contrasted with matras) + +# [Not derivable] +Value: Vowel_Dependent +# Dependent Vowels (contrasted with independent vowels and/or with +# complex placement). Known as matras in Indic scripts. Also +# includes vowel modifiers that follow dependent (and sometimes +# independent) vowels. + +# [Not derivable] +Value: Vowel +# (Other) Vowels (reanalyzed as ordinary alphabetic letters or marks) + +# [Not derivable] +Value: Consonant_Placeholder +# Consonant Placeholder +# This includes generic placeholders used for +# Indic script layout (NBSP and dotted circle), as well as a few script- +# specific vowel-holder characters which are not technically +# consonants, but serve instead as bases for placement of vowel marks. + +# [Not derivable] +Value: Consonant +# Consonant (ordinary abugida consonants, with inherent vowels) + +# [Not derivable] +Value: Consonant_Dead +# Dead Consonant (special consonant with killed vowel) + +# [Not derivable] +Value: Consonant_With_Stacker +# Consonants that may make stacked ligatures with the next consonant +# without the use of a virama + +# [Not derivable] +Value: Consonant_Prefixed +# Cluster-initial consonants + +# [Not derivable] +Value: Consonant_Preceding_Repha +# Repha Form of RA (reanalyzed in some scripts), when preceding the main +# consonant. + +# [Not derivable] +Value: Consonant_Initial_Postfixed +# Consonants that succeed the main consonant in character sequences, but are +# pronounced before it. + +# [Not derivable] +Value: Consonant_Succeeding_Repha +# Repha Form of RA (reanalyzed in some scripts), when succeeding the main +# consonant. + +# [Not derivable] +Value: Consonant_Subjoined +# Subjoined Consonant (C2 form subtending a base consonant in Tibetan, etc.) + +# [Not derivable] +Value: Consonant_Medial +# Medial Consonant (medial liquid, occurring in clusters) + +# [Not derivable] +Value: Consonant_Final +# Final Consonant (special final forms which do not take vowels) + +# [Not derivable] +Value: Consonant_Head_Letter +# Head Letter (Tibetan) + +# [Not derivable] +Value: Modifying_Letter +# Reanalyzed letters not participating in the abugida structure, but +# serving to modify the sound of an adjacent vowel or consonant. +# Note that this is not the same as General_Category=Modifier_Letter. + +# [Not derivable] +Value: Tone_Letter +# Tone Letter (spacing lexical tone mark with status as a letter) + +# [Not derivable] +Value: Tone_Mark +# Tone Mark (nonspacing or spacing lexical tone mark) + +# [Not derivable] +Value: Gemination_Mark +# Gemination Mark (doubling of the preceding or following consonant) +# +# U+0A71 GURMUKHI ADDAK precedes the consonant it geminates, while the +# others follow the consonant they geminate. + +# [Not derivable] +Value: Cantillation_Mark +# Cantillation Mark (recitation marks, such as svara markers for the Samaveda) + +# [Not derivable] +Value: Register_Shifter +# Register Shifter (shifts register for consonants, akin to a tone mark) + +# [Not derivable] +Value: Syllable_Modifier +# Syllable Modifier (miscellaneous combining characters that modify +# something in the orthographic syllable they succeed or appear in) + +# [Not derivable] +Value: Consonant_Killer +# Consonant Killer (signifies that the previous consonant or consonants are +# not pronounced) + +# [Not derivable] +Value: Non_Joiner +# Non_Joiner (Zero Width Non-Joiner) + +# [Not derivable] +Value: Joiner +# Joiner (Zero Width Joiner) + +# [Not derivable] +Value: Number_Joiner +# Number_Joiner (forms ligatures between numbers for multiplication) + +# [Not derivable] +Value: Number +# Number (can be used as vowel-holders like consonant placeholders) +# Note: A number may even hold subjoined consonants which may in turn +# have been formed using a virama or a stacker, e.g. the sequence +# where THAI THAM LETTER LOW TA is subjoined to +# TAI THAM THAM DIGIT THREE using an invisible stacker. + +# [Not derivable] +Value: Brahmi_Joining_Number +# Brahmi Joining Number (may be joined by a Number_Joiner of the same +# script, e.g. in Brahmi) +# +# Note: These are different from Numbers, in the way that there is no known +# evidence of Brahmi Joining Numbers taking vowels or subjoined consonants. +# Until such evidence is found, implementations may assume that Brahmi +# Joining Numbers only participate in shaping with other Brahmi Joining +# Numbers. + +# [Not derivable] + File: UnicodeData Property: SPECIAL diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/ShortBlockNames.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/ShortBlockNames.txt index 32a6ce8cc..3fc09e21a 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/ShortBlockNames.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/ShortBlockNames.txt @@ -272,6 +272,7 @@ Soyombo ; Soyombo Specials ; Specials Sundanese ; Sundanese Sundanese_Sup ; Sundanese_Supplement +Sunuwar ; Sunuwar Super_And_Sub ; Superscripts_And_Subscripts Sup_Arrows_A ; Supplemental_Arrows_A Sup_Arrows_B ; Supplemental_Arrows_B diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 511f0967b..8ee8762b3 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -405,12 +405,30 @@ Let $identifier_extend = [\p{GC=Mn}\p{GC=Mc}\p{GC=Nd}\p{GC=Pc}] In \P{U-1:GC=Cn} ccc=U-1:ccc # Canonical decompositions (minus exclusions) must be identical across releases (also required by strong normalization stability), -# except where a character and at lease one character in its decomposition are both new in the release. -Let $NFC_Exceptions = [\U0001109A\U0001109C\U000110AB[\U0001112E\U0001112F \U0001134B-\U0001134C \U000114BB-\U000114BC \U000114BE \U000115BA-\U000115BB] \U00011938] -# 6.1.0 Added CHAKMA VOWEL SIGN O..CHAKMA VOWEL SIGN AU -# 7.0 Added 1134B..1134C, 114BB..114BC, 114BE, and 115BA..115BB -# 13.0 Added 11938 DIVES AKURU VOWEL SIGN O -[\p{Decomposition_Type=Canonical} - \p{Full_Composition_Exclusion} - $NFC_Exceptions] = [\p{U-1:Decomposition_Type=Canonical} - \p{U-1:Full_Composition_Exclusion} - $NFC_Exceptions] +# except where a character and at least one character in its decomposition are both new in the release. +Let $New_Decompositions = [[\p{Decomposition_Type=Canonical} - \p{Full_Composition_Exclusion}] - [\p{U-1:Decomposition_Type=Canonical} - \p{U-1:Full_Composition_Exclusion}]] +$New_Decompositions ⊆ \p{U-1:GC=Cn} +# Stripping previously-unassigned characters from the current NFD does +# something, that is, the decomposition contains newly-assigned characters. +In $New_Decompositions toNFD * \P{U-1:GC=Cn} ≠ toNFD + +Let $Unicode_13_Decompositions = [[\p{U13.0.0:Decomposition_Type=Canonical} - \p{U13.0.0:Full_Composition_Exclusion}] - [\p{U12.1.0:Decomposition_Type=Canonical} - \p{U12.1.0:Full_Composition_Exclusion}]] +$Unicode_13_Decompositions ⊆ \p{U12.1.0:GC=Cn} +In $Unicode_13_Decompositions toNFD * \P{U12.1.0:GC=Cn} ≠ toNFD +$Unicode_13_Decompositions = [\U00011938] +$Unicode_13_Decompositions = [\p{Name=DIVES AKURU VOWEL SIGN O}] + +Let $Unicode_7_Decompositions = [[\p{U7.0.0:Decomposition_Type=Canonical} - \p{U7.0.0:Full_Composition_Exclusion}] - [\p{U6.3.0:Decomposition_Type=Canonical} - \p{U6.3.0:Full_Composition_Exclusion}]] +$Unicode_7_Decompositions ⊆ \p{U6.3.0:GC=Cn} +In $Unicode_7_Decompositions toNFD * \P{U6.3.0:GC=Cn} ≠ toNFD +$Unicode_7_Decompositions = [\U0001134B-\U0001134C \U000114BB-\U000114BC \U000114BE \U000115BA-\U000115BB] +$Unicode_7_Decompositions ⊆ [\p{Name=/^(GRANTHA|TIRHUTA|SIDDHAM) VOWEL SIGN /}] + +Let $Unicode_6_1_Decompositions = [[\p{U6.1.0:Decomposition_Type=Canonical} - \p{U6.1.0:Full_Composition_Exclusion}] - [\p{U6.0.0:Decomposition_Type=Canonical} - \p{U6.0.0:Full_Composition_Exclusion}]] +$Unicode_6_1_Decompositions ⊆ \p{U6.0.0:GC=Cn} +In $Unicode_6_1_Decompositions toNFD * \P{U6.0.0:GC=Cn} ≠ toNFD +$Unicode_6_1_Decompositions = [\U0001112E-\U0001112F] +$Unicode_6_1_Decompositions ⊆ [\p{Name=/^CHAKMA VOWEL SIGN /}] # Stability: All characters other than those with General_Category property values Spacing_Mark (Mc) and Nonspacing_Mark (Mn) have the Canonical_Combining_Class property value 0. \p{CCC=0} ⊇ [^ \p{GC=Mc} \p{GC=Mn}] @@ -512,7 +530,10 @@ Show [\u20b9] # exceptions. Should such exceptions arise, they can be added to the definition of # $nonAlphabeticBindus to avoid a failure on this test. Let $nonAlphabeticBindus = [] -[\p{InSc=Bindu} - $nonAlphabeticBindus - \p{Alphabetic}] = [] +[\p{InSc=Bindu} - \p{Alphabetic}] = $nonAlphabeticBindus + +Let $nonAlphabeticDependentVowels = [\N{ORIYA SIGN OVERLINE}\N{THAI CHARACTER MAITAIKHU}\N{LIMBU SIGN KEMPHRENG}\N{SHARADA VOWEL MODIFIER MARK}\N{SHARADA EXTRA SHORT VOWEL MARK}] +[\p{InSC=Vowel_Dependent} - \p{Alphabetic}] = $nonAlphabeticDependentVowels ########################## # LineBreak property