diff --git a/.github/workflows/cli-build-instructions.yml b/.github/workflows/cli-build-instructions.yml
index 3fd9e8e56..24d4dce1c 100644
--- a/.github/workflows/cli-build-instructions.yml
+++ b/.github/workflows/cli-build-instructions.yml
@@ -81,8 +81,14 @@ jobs:
run: |
mkdir -p Generated/BIN
- - name: Run command - Build and Test
- run: MAVEN_OPTS="-ea" mvn -s .github/workflows/mvn-settings.xml package -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
+ # These steps only smoke-test the in-source build process, so we do not
+ # run the whole build and test suite, which is quite slow (6 min 26 s as
+ # of this writing). We just run the invariant tests and smoke-test
+ # MakeUnicodeFiles. We do not even check here that MakeUnicodeFiles leaves
+ # the UCD files unchanged; skipping that check would make little sense on
+ # its own, but it is performed by the ucd-and-smoke-tests job.
+ - name: Run invariant tests
+ run: MAVEN_OPTS="-ea" mvn -s .github/workflows/mvn-settings.xml test -am -pl unicodetools -Dtest=TestTestUnicodeInvariants -DfailIfNoTests=false -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -91,14 +97,15 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- out-of-source-build:
- name: Out-of-source Instructions
+
+ # Out-of-source build.
+ ucd-and-smoke-tests:
+ name: Check UCD consistency, invariants, smoke-test generators
runs-on: ubuntu-latest
steps:
- name: Checkout Unicode Tools
uses: actions/checkout@v3
with:
- repository: unicode-org/unicodetools
path: unicodetools/mine/src
- name: Get the CLDR_REF from pom.xml
id: cldr_ref
@@ -136,6 +143,30 @@ jobs:
run: |
mkdir -p unicodetools/mine/Generated/BIN
+ - name: Run command - Make Unicode Files
+ run: |
+ cd unicodetools/mine/src
+ mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCD.Main" -Dexec.args="version $CURRENT_UVERSION build MakeUnicodeFiles" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Check that UCD files are consistent
+ run: |
+ cd unicodetools/mine/src
+ ./py/copygenerateducd.py --out-of-source -y
+ git diff --compact-summary --exit-code || {
+ git diff --compact-summary |
+ awk '{
+ if (previous) {
+ print "::error file="previous",title=File must be regenerated::Run org.unicode.text.UCD.Main build MakeUnicodeFiles and copy any changed files to unicodetools/data/ucd/dev."
+ }
+ previous=$1
+ }'
+ exit 1
+ }
+
+ # Only test once we know the UCD is internally consistent.
+ # MakeUnicodeFiles is much faster than this anyway.
- name: Run command - Build and Test
run: |
cd unicodetools/mine/src
@@ -151,13 +182,6 @@ jobs:
path: |
unicodetools/mine/Generated/UnicodeTestResults.*
- - name: Run command - Make Unicode Files
- run: |
- cd unicodetools/mine/src
- mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCD.Main" -Dexec.args="version $CURRENT_UVERSION build MakeUnicodeFiles" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
# https://github.com/unicode-org/unicodetools/blob/main/docs/emoji/aac.md#aacorderjava
- name: Run command - AAC Order
run: |
@@ -166,18 +190,6 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- # https://github.com/unicode-org/unicodetools/blob/main/docs/uca/index.md#tools--tests
- # Note: Not running desuffixucd.py in UCA jobs because no version numbers detected in data file names
- - name: Run command - UCA - collation validity log
- run: |
- cd unicodetools/mine/src
- # invoke main() in class ...UCA.Main
- mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCA.Main" -Dexec.args="writeCollationValidityLog ICU" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
- # check for output file
- compgen -G "../Generated/UCA/*/CheckCollationValidity.html"
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
# https://github.com/unicode-org/unicodetools/blob/main/docs/idna.md
- name: Run command - IDNA
run: |
@@ -252,3 +264,61 @@ jobs:
mvn -s .github/workflows/mvn-settings.xml -Dexec.mainClass="org.unicode.propstest.CheckProperties" -Dexec.classpathScope=test test-compile -Dexec.args="COMPARE ALL $PREVIOUS_UVERSION" compile exec:java -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ # Out-of-source build.
+ uca:
+ name: Check UCA data
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout Unicode Tools
+ uses: actions/checkout@v3
+ with:
+ repository: unicode-org/unicodetools
+ path: unicodetools/mine/src
+ - name: Get the CLDR_REF from pom.xml
+ id: cldr_ref
+ run: echo "CLDR_REF="$(mvn --file unicodetools/mine/src/pom.xml help:evaluate -Dexpression=cldr.version -q -DforceStdout | cut -d- -f3) >> $GITHUB_OUTPUT && cat ${GITHUB_OUTPUT}
+ - name: Verify CLDR checkout ref
+ run: echo CLDR_REF="${{ steps.cldr_ref.outputs.CLDR_REF }}" && [ "${{ steps.cldr_ref.outputs.CLDR_REF }}x" != "x" ] # fail if empty
+ - name: Cache CLDR repository
+ uses: actions/cache@v3
+ with:
+ path: cldr/mine/src
+ key: cldr-${{ steps.cldr_ref.outputs.CLDR_REF }}
+ restore-keys: |
+ cldr
+ - name: Check out CLDR
+ uses: actions/checkout@v3
+ with:
+ repository: unicode-org/cldr
+ path: cldr/mine/src
+ ref: main
+ fetch-depth: 0
+ - name: Switch CLDR to CLDR_REF
+ run: cd cldr/mine/src && git fetch && git checkout ${{ steps.cldr_ref.outputs.CLDR_REF }}
+ - name: Set up JDK 11
+ uses: actions/setup-java@v1
+ with:
+ java-version: 11
+ - name: Cache local Maven repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.m2/repository
+ key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-maven-
+ - name: Set up out-of-source output dir
+ run: |
+ mkdir -p unicodetools/mine/Generated/BIN
+
+ # https://github.com/unicode-org/unicodetools/blob/main/docs/uca/index.md#tools--tests
+ # Note: Not running desuffixucd.py in UCA jobs because no version numbers detected in data file names
+ - name: Run command - UCA - collation validity log
+ run: |
+ cd unicodetools/mine/src
+ # invoke main() in class ...UCA.Main
+ mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCA.Main" -Dexec.args="writeCollationValidityLog ICU" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
+ # check for output file
+ compgen -G "../Generated/UCA/*/CheckCollationValidity.html"
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/UnicodeJsps/jetty.d/ROOT/robots.txt b/UnicodeJsps/jetty.d/ROOT/robots.txt
new file mode 100644
index 000000000..a40ff93be
--- /dev/null
+++ b/UnicodeJsps/jetty.d/ROOT/robots.txt
@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: /UnicodeJsps
diff --git a/UnicodeJsps/pom.xml b/UnicodeJsps/pom.xml
index 98f0e75b2..83d01106f 100644
--- a/UnicodeJsps/pom.xml
+++ b/UnicodeJsps/pom.xml
@@ -65,7 +65,7 @@
com.google.guava
guava
- 29.0-jre
+ 32.0.0-jre
diff --git a/docs/unicodejsps/index.md b/docs/unicodejsps/index.md
index 477f20d08..c3d97f27c 100644
--- a/docs/unicodejsps/index.md
+++ b/docs/unicodejsps/index.md
@@ -1,11 +1,5 @@
# Building UnicodeJsp
-- Note: you can run the latest UnicodeJsp locally with docker using:
-
-```
-docker run --rm -p 8080:8080 unicode/unicode-jsp
-```
-
- Note 2: there are some notes on updated processes for using GCP at [gcp-run.md](./gcp-run.md) - at present, automated deployment is TODO.
## Compiling
@@ -113,7 +107,26 @@ Look at , and make sure that
there aren't any Z-Other props at the bottom (you'll need to update via Adding
New Properties if there are).
-(:construction: **TODO**: explain how to do a Docker-based build here.)
+### Running a Docker-based build
+
+First, compile the Java code:
+
+- `mvn -B package -am -pl UnicodeJsps -DskipTests=true`
+
+Next, make a “backup” copy of CLDR and UnicodeTools. (`~/src/cldr` is an optional existing CLDR checkout that `git clone` can reference to save some download traffic.)
+
+- `git clone --reference-if-able ~/src/cldr https://github.com/unicode-org/cldr.git || (cd cldr && git pull)`
+- `mkdir -p UnicodeJsps/target && tar -cpz --exclude=.git --exclude=unicodetools/target/ -f UnicodeJsps/target/cldr-unicodetools.tgz ./cldr/ ./unicodetools/`
+
+Now, finally build.
+
+- `docker build -t unicode/unicode-jsp:latest UnicodeJsps/`
+
+… And run. Press Control-C to stop the server; otherwise, visit http://localhost:8080/ in a browser.
+
+```
+docker run --rm -p 8080:8080 unicode/unicode-jsp:latest
+```
## Commit/PR
diff --git a/py/copygenerateducd.py b/py/copygenerateducd.py
old mode 100644
new mode 100755
index a1a8f2f73..1b64f116b
--- a/py/copygenerateducd.py
+++ b/py/copygenerateducd.py
@@ -17,9 +17,10 @@
def main():
+ out_of_source = '--out-of-source' in sys.argv[1:]
cwd = Path().cwd()
uversion = os.getenv("CURRENT_UVERSION")
- genucddir = cwd / "Generated" / "UCD" / uversion
+ genucddir = (cwd / ".." if out_of_source else cwd) / "Generated" / "UCD" / uversion
if not genucddir.exists():
raise Exception(f"Generated directory not found at {genucddir.absolute()}")
@@ -34,7 +35,7 @@ def main():
print("THE FOLLOWING FILES WILL BE MOVED:\n")
print("\n".join([f"{str(p.name)} --> {devucddir / p.relative_to(genucddir)}" for p in to_move])) # noqa: E501
- confirm = bool(sys.argv[-1] == "-y") # enable running this in automation
+ confirm = bool("-y" in sys.argv[1:]) # enable running this in automation
if not confirm:
confirm = input("\nProceed [y/N]?").lower() == "y"
diff --git a/unicodetools/data/ucd/dev/ArabicShaping.txt b/unicodetools/data/ucd/dev/ArabicShaping.txt
index dd8cb333e..0def17a03 100644
--- a/unicodetools/data/ucd/dev/ArabicShaping.txt
+++ b/unicodetools/data/ucd/dev/ArabicShaping.txt
@@ -828,6 +828,11 @@ A873; PHAGS-PA CANDRABINDU; U; No_Joining_Group
10D22; HANIFI ROHINGYA SAKIN; R; No_Joining_Group
10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA
+# Arabic Extended-D Characters
+10EC2; DAL WITH VERTICAL 2 DOTS BELOW; R; DAL
+10EC3; TAH WITH VERTICAL 2 DOTS BELOW; D; TAH
+10EC4; KAF WITH VERTICAL 2 DOTS BELOW; D; KAF
+
# Sogdian Characters
10F30; SOGDIAN ALEPH; D; No_Joining_Group
diff --git a/unicodetools/data/ucd/dev/DerivedAge.txt b/unicodetools/data/ucd/dev/DerivedAge.txt
index 459b60773..6e4ff6d77 100644
--- a/unicodetools/data/ucd/dev/DerivedAge.txt
+++ b/unicodetools/data/ucd/dev/DerivedAge.txt
@@ -1,5 +1,5 @@
# DerivedAge-16.0.0.txt
-# Date: 2023-10-03, 19:01:23 GMT
+# Date: 2023-10-12, 18:06:06 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -2009,8 +2009,10 @@ FDFE..FDFF ; 14.0 # [2] ARABIC LIGATURE SUBHAANAHU WA TAAALAA..ARABIC LIGAT
# Newly assigned in Unicode 16.0.0 (September, 2024)
+0897 ; 16.0 # ARABIC PEPET
1C89..1C8A ; 16.0 # [2] CYRILLIC CAPITAL LETTER TJE..CYRILLIC SMALL LETTER TJE
+10EC2..10EC4 ; 16.0 # [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
-# Total code points: 2
+# Total code points: 6
# EOF
diff --git a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt
index ce72500d9..6ccf0fc4e 100644
--- a/unicodetools/data/ucd/dev/DerivedCoreProperties.txt
+++ b/unicodetools/data/ucd/dev/DerivedCoreProperties.txt
@@ -1,5 +1,5 @@
# DerivedCoreProperties-16.0.0.txt
-# Date: 2023-10-03, 19:26:36 GMT
+# Date: 2023-10-12, 18:06:57 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -343,6 +343,7 @@ FFE9..FFEC ; Math # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A
0860..086A ; Alphabetic # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
0870..0887 ; Alphabetic # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
0889..088E ; Alphabetic # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
+0897 ; Alphabetic # Mn ARABIC PEPET
08A0..08C8 ; Alphabetic # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
08C9 ; Alphabetic # Lm ARABIC SMALL FARSI YEH
08D4..08DF ; Alphabetic # Mn [12] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH WORD WAQFA
@@ -1041,6 +1042,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
10E80..10EA9 ; Alphabetic # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EAB..10EAC ; Alphabetic # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
10EB0..10EB1 ; Alphabetic # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; Alphabetic # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F00..10F1C ; Alphabetic # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; Alphabetic # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
10F30..10F45 ; Alphabetic # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@@ -1402,7 +1404,7 @@ FFDA..FFDC ; Alphabetic # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
30000..3134A ; Alphabetic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
31350..323AF ; Alphabetic # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 138389
+# Total code points: 138393
# ================================================
@@ -3017,7 +3019,7 @@ FF41..FF5A ; Cased # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN
0859..085B ; Case_Ignorable # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
0888 ; Case_Ignorable # Sk ARABIC RAISED ROUND DOT
0890..0891 ; Case_Ignorable # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
-0898..089F ; Case_Ignorable # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; Case_Ignorable # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08C9 ; Case_Ignorable # Lm ARABIC SMALL FARSI YEH
08CA..08E1 ; Case_Ignorable # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
08E2 ; Case_Ignorable # Cf ARABIC DISPUTED END OF AYAH
@@ -3442,7 +3444,7 @@ E0001 ; Case_Ignorable # Cf LANGUAGE TAG
E0020..E007F ; Case_Ignorable # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Case_Ignorable # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2707
+# Total code points: 2708
# ================================================
@@ -6646,6 +6648,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
10D00..10D23 ; ID_Start # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
10E80..10EA9 ; ID_Start # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EB0..10EB1 ; ID_Start # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; ID_Start # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F00..10F1C ; ID_Start # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; ID_Start # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
10F30..10F45 ; ID_Start # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@@ -6864,7 +6867,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
30000..3134A ; ID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
31350..323AF ; ID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 136969
+# Total code points: 136972
# ================================================
@@ -6971,7 +6974,7 @@ FFDA..FFDC ; ID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
0860..086A ; ID_Continue # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
0870..0887 ; ID_Continue # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
0889..088E ; ID_Continue # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
-0898..089F ; ID_Continue # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; ID_Continue # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08A0..08C8 ; ID_Continue # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
08C9 ; ID_Continue # Lm ARABIC SMALL FARSI YEH
08CA..08E1 ; ID_Continue # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
@@ -7787,6 +7790,7 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
10E80..10EA9 ; ID_Continue # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EAB..10EAC ; ID_Continue # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
10EB0..10EB1 ; ID_Continue # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; ID_Continue # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10EFD..10EFF ; ID_Continue # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F00..10F1C ; ID_Continue # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; ID_Continue # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
@@ -8223,7 +8227,7 @@ FFDA..FFDC ; ID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
31350..323AF ; ID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
E0100..E01EF ; ID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 140110
+# Total code points: 140114
# ================================================
@@ -8760,6 +8764,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
10D00..10D23 ; XID_Start # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
10E80..10EA9 ; XID_Start # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EB0..10EB1 ; XID_Start # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; XID_Start # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F00..10F1C ; XID_Start # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; XID_Start # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
10F30..10F45 ; XID_Start # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@@ -8978,7 +8983,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
30000..3134A ; XID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
31350..323AF ; XID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 136946
+# Total code points: 136949
# ================================================
@@ -9081,7 +9086,7 @@ FFDA..FFDC ; XID_Start # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
0860..086A ; XID_Continue # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
0870..0887 ; XID_Continue # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
0889..088E ; XID_Continue # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
-0898..089F ; XID_Continue # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; XID_Continue # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08A0..08C8 ; XID_Continue # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
08C9 ; XID_Continue # Lm ARABIC SMALL FARSI YEH
08CA..08E1 ; XID_Continue # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
@@ -9902,6 +9907,7 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
10E80..10EA9 ; XID_Continue # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EAB..10EAC ; XID_Continue # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
10EB0..10EB1 ; XID_Continue # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; XID_Continue # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10EFD..10EFF ; XID_Continue # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F00..10F1C ; XID_Continue # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; XID_Continue # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
@@ -10338,7 +10344,7 @@ FFDA..FFDC ; XID_Continue # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
31350..323AF ; XID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
E0100..E01EF ; XID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 140091
+# Total code points: 140095
# ================================================
@@ -10423,7 +10429,7 @@ E01F0..E0FFF ; Default_Ignorable_Code_Point # Cn [3600] ....
0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-0898..089F ; Extend # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
093A ; Extend # Mn DEVANAGARI VOWEL SIGN OE
@@ -459,7 +459,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2130
+# Total code points: 2131
# ================================================
diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt
index 2c6778fd8..20d4b45a6 100644
--- a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt
+++ b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt
@@ -1,5 +1,5 @@
# SentenceBreakProperty-16.0.0.txt
-# Date: 2023-10-03, 19:02:47 GMT
+# Date: 2023-10-12, 18:07:56 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -55,7 +55,7 @@
0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-0898..089F ; Extend # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
0903 ; Extend # Mc DEVANAGARI SIGN VISARGA
@@ -567,7 +567,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2550
+# Total code points: 2551
# ================================================
@@ -2357,6 +2357,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
10D00..10D23 ; OLetter # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
10E80..10EA9 ; OLetter # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EB0..10EB1 ; OLetter # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; OLetter # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F00..10F1C ; OLetter # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; OLetter # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
10F30..10F45 ; OLetter # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@@ -2538,7 +2539,7 @@ FFDA..FFDC ; OLetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
30000..3134A ; OLetter # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
31350..323AF ; OLetter # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 132658
+# Total code points: 132661
# ================================================
diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt b/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt
index 9cce2a126..0a602a559 100644
--- a/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt
+++ b/unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt
@@ -1,5 +1,5 @@
# WordBreakProperty-16.0.0.txt
-# Date: 2023-10-03, 19:02:50 GMT
+# Date: 2023-10-12, 18:08:02 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -91,7 +91,7 @@ FB46..FB4F ; Hebrew_Letter # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW
0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-0898..089F ; Extend # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
0903 ; Extend # Mc DEVANAGARI SIGN VISARGA
@@ -604,7 +604,7 @@ FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2554
+# Total code points: 2555
# ================================================
@@ -1115,6 +1115,7 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
10D00..10D23 ; ALetter # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
10E80..10EA9 ; ALetter # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EB0..10EB1 ; ALetter # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; ALetter # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F00..10F1C ; ALetter # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; ALetter # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
10F30..10F45 ; ALetter # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@@ -1313,7 +1314,7 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
1F150..1F169 ; ALetter # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
1F170..1F189 ; ALetter # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
-# Total code points: 29492
+# Total code points: 29495
# ================================================
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt
index 6a1cb9b72..897d379c6 100644
--- a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt
+++ b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt
@@ -1,5 +1,5 @@
# DerivedBidiClass-16.0.0.txt
-# Date: 2023-10-03, 19:02:02 GMT
+# Date: 2023-10-12, 18:06:52 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -2023,7 +2023,7 @@ FFFFE..FFFFF ; BN # Cn [2] ..
0825..0827 ; NSM # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; NSM # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; NSM # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-0898..089F ; NSM # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; NSM # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08CA..08E1 ; NSM # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
08E3..0902 ; NSM # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
093A ; NSM # Mn DEVANAGARI VOWEL SIGN OE
@@ -2348,7 +2348,7 @@ FE20..FE2F ; NSM # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC
1E944..1E94A ; NSM # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
E0100..E01EF ; NSM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 1993
+# Total code points: 1994
# ================================================
@@ -2395,6 +2395,7 @@ FDFC ; AL # Sc RIAL SIGN
FE70..FE74 ; AL # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
FE76..FEFC ; AL # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
10D00..10D23 ; AL # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
+10EC2..10EC4 ; AL # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F30..10F45 ; AL # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
10F51..10F54 ; AL # No [4] SOGDIAN NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED
10F55..10F59 ; AL # Po [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
@@ -2440,8 +2441,8 @@ FE76..FEFC ; AL # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WI
1EEA5..1EEA9 ; AL # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
1EEAB..1EEBB ; AL # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
-# The above property value applies to 298 code points not listed here.
-# Total code points: 1769
+# The above property value applies to 294 code points not listed here.
+# Total code points: 1768
# ================================================
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt
index 2e2b401d3..ae815d9be 100644
--- a/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt
+++ b/unicodetools/data/ucd/dev/extracted/DerivedCombiningClass.txt
@@ -1,5 +1,5 @@
# DerivedCombiningClass-16.0.0.txt
-# Date: 2023-10-03, 19:02:05 GMT
+# Date: 2023-10-12, 18:06:56 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -1463,6 +1463,7 @@ FFFC..FFFD ; 0 # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER
10E80..10EA9 ; 0 # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EAD ; 0 # Pd YEZIDI HYPHENATION MARK
10EB0..10EB1 ; 0 # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; 0 # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F00..10F1C ; 0 # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F1D..10F26 ; 0 # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
10F27 ; 0 # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
@@ -2005,8 +2006,8 @@ E0100..E01EF ; 0 # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
F0000..FFFFD ; 0 # Co [65534] ..
100000..10FFFD; 0 # Co [65534] ..
-# The above property value applies to 826764 code points not listed here.
-# Total code points: 1113190
+# The above property value applies to 826760 code points not listed here.
+# Total code points: 1113189
# ================================================
@@ -2652,7 +2653,7 @@ FE27..FE2D ; 220 # Mn [7] COMBINING LIGATURE LEFT HALF BELOW..COMBINING CON
081B..0823 ; 230 # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
0825..0827 ; 230 # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; 230 # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
-0898 ; 230 # Mn ARABIC SMALL HIGH WORD AL-JUZ
+0897..0898 ; 230 # Mn [2] ARABIC PEPET..ARABIC SMALL HIGH WORD AL-JUZ
089C..089F ; 230 # Mn [4] ARABIC MADDA WAAJIB..ARABIC HALF MADDA OVER MADDA
08CA..08CE ; 230 # Mn [5] ARABIC SMALL HIGH FARSI YEH..ARABIC LARGE ROUND DOT ABOVE
08D4..08E1 ; 230 # Mn [14] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH SIGN SAFHA
@@ -2741,7 +2742,7 @@ FE2E..FE2F ; 230 # Mn [2] COMBINING CYRILLIC TITLO LEFT HALF..COMBINING CYR
1E4EF ; 230 # Mn NAG MUNDARI SIGN SUTUH
1E944..1E949 ; 230 # Mn [6] ADLAM ALIF LENGTHENER..ADLAM GEMINATE CONSONANT MODIFIER
-# Total code points: 510
+# Total code points: 511
# ================================================
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt
index 2d272278a..547a3853b 100644
--- a/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt
+++ b/unicodetools/data/ucd/dev/extracted/DerivedEastAsianWidth.txt
@@ -1,5 +1,5 @@
# DerivedEastAsianWidth-16.0.0.txt
-# Date: 2023-10-03, 19:02:08 GMT
+# Date: 2023-10-12, 18:07:00 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -225,7 +225,7 @@
0888 ; N # Sk ARABIC RAISED ROUND DOT
0889..088E ; N # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
0890..0891 ; N # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
-0898..089F ; N # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; N # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08A0..08C8 ; N # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
08C9 ; N # Lm ARABIC SMALL FARSI YEH
08CA..08E1 ; N # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
@@ -1472,6 +1472,7 @@ FFFC ; N # So OBJECT REPLACEMENT CHARACTER
10EAB..10EAC ; N # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
10EAD ; N # Pd YEZIDI HYPHENATION MARK
10EB0..10EB1 ; N # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; N # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10EFD..10EFF ; N # Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
10F00..10F1C ; N # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F1D..10F26 ; N # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
@@ -2043,7 +2044,7 @@ FFFC ; N # So OBJECT REPLACEMENT CHARACTER
E0001 ; N # Cf LANGUAGE TAG
E0020..E007F ; N # Cf [96] TAG SPACE..CANCEL TAG
-# The above property value applies to 766282 code points not listed here.
+# The above property value applies to 766278 code points not listed here.
# Total code points: 792618
# ================================================
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt
index c01c5b904..c0ceaaf60 100644
--- a/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt
+++ b/unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt
@@ -1,5 +1,5 @@
# DerivedGeneralCategory-16.0.0.txt
-# Date: 2023-10-03, 19:02:09 GMT
+# Date: 2023-10-12, 18:07:01 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -37,7 +37,7 @@
085F ; Cn #
086B..086F ; Cn # [5] ..
088F ; Cn #
-0892..0897 ; Cn # [6] ..
+0892..0896 ; Cn # [5] ..
0984 ; Cn #
098D..098E ; Cn # [2] ..
0991..0992 ; Cn # [2] ..
@@ -435,7 +435,8 @@ FFFE..FFFF ; Cn # [2] ..
10E7F ; Cn #
10EAA ; Cn #
10EAE..10EAF ; Cn # [2] ..
-10EB2..10EFC ; Cn # [75] ..
+10EB2..10EC1 ; Cn # [16] ..
+10EC5..10EFC ; Cn # [56] ..
10F28..10F2F ; Cn # [8] ..
10F5A..10F6F ; Cn # [22] ..
10F8A..10FAF ; Cn # [38] ..
@@ -723,7 +724,7 @@ E01F0..EFFFF ; Cn # [65040] ..
FFFFE..FFFFF ; Cn # [2] ..
10FFFE..10FFFF; Cn # [2] ..
-# Total code points: 824716
+# Total code points: 824712
# ================================================
@@ -2486,6 +2487,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
10D00..10D23 ; Lo # [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
10E80..10EA9 ; Lo # [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EB0..10EB1 ; Lo # [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; Lo # [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F00..10F1C ; Lo # [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F27 ; Lo # OLD SOGDIAN LIGATURE AYIN-DALETH
10F30..10F45 ; Lo # [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@@ -2656,7 +2658,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
30000..3134A ; Lo # [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
31350..323AF ; Lo # [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-# Total code points: 132234
+# Total code points: 132237
# ================================================
@@ -2686,7 +2688,7 @@ FFDA..FFDC ; Lo # [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
0825..0827 ; Mn # [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; Mn # [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; Mn # [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-0898..089F ; Mn # [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; Mn # [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08CA..08E1 ; Mn # [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
08E3..0902 ; Mn # [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
093A ; Mn # DEVANAGARI VOWEL SIGN OE
@@ -3009,7 +3011,7 @@ FE20..FE2F ; Mn # [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITL
1E944..1E94A ; Mn # [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
E0100..E01EF ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 1985
+# Total code points: 1986
# ================================================
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedJoiningGroup.txt b/unicodetools/data/ucd/dev/extracted/DerivedJoiningGroup.txt
index 364847b91..2589107eb 100644
--- a/unicodetools/data/ucd/dev/extracted/DerivedJoiningGroup.txt
+++ b/unicodetools/data/ucd/dev/extracted/DerivedJoiningGroup.txt
@@ -1,5 +1,5 @@
-# DerivedJoiningGroup-15.1.0.txt
-# Date: 2023-01-05, 20:34:37 GMT
+# DerivedJoiningGroup-16.0.0.txt
+# Date: 2023-10-02, 12:16:28 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -72,8 +72,9 @@
06EE ; Dal # Lo ARABIC LETTER DAL WITH INVERTED V
0759..075A ; Dal # Lo [2] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW AND SMALL TAH..ARABIC LETTER DAL WITH INVERTED SMALL V BELOW
08AE ; Dal # Lo ARABIC LETTER DAL WITH THREE DOTS BELOW
+10EC2 ; Dal # Lo ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW
-# Total code points: 15
+# Total code points: 16
# ================================================
@@ -177,8 +178,9 @@
06AC..06AE ; Kaf # Lo [3] ARABIC LETTER KAF WITH DOT ABOVE..ARABIC LETTER KAF WITH THREE DOTS BELOW
077F ; Kaf # Lo ARABIC LETTER KAF WITH TWO DOTS ABOVE
08B4 ; Kaf # Lo ARABIC LETTER KAF WITH DOT BELOW
+10EC4 ; Kaf # Lo ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
-# Total code points: 6
+# Total code points: 7
# ================================================
@@ -331,8 +333,9 @@
069F ; Tah # Lo ARABIC LETTER TAH WITH THREE DOTS ABOVE
088B..088C ; Tah # Lo [2] ARABIC LETTER TAH WITH DOT BELOW..ARABIC LETTER TAH WITH THREE DOTS BELOW
08A3 ; Tah # Lo ARABIC LETTER TAH WITH TWO DOTS ABOVE
+10EC3 ; Tah # Lo ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW
-# Total code points: 6
+# Total code points: 7
# ================================================
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt b/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt
index a4e01e7d3..082e7a262 100644
--- a/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt
+++ b/unicodetools/data/ucd/dev/extracted/DerivedJoiningType.txt
@@ -1,5 +1,5 @@
-# DerivedJoiningType-15.1.0.txt
-# Date: 2023-01-05, 20:34:38 GMT
+# DerivedJoiningType-16.0.0.txt
+# Date: 2023-10-02, 12:16:29 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -95,6 +95,7 @@ A840..A871 ; D # Lo [50] PHAGS-PA LETTER KA..PHAGS-PA SUBJOINED LETTER RA
10BAD..10BAE ; D # No [2] PSALTER PAHLAVI NUMBER TEN..PSALTER PAHLAVI NUMBER TWENTY
10D01..10D21 ; D # Lo [33] HANIFI ROHINGYA LETTER BA..HANIFI ROHINGYA VOWEL O
10D23 ; D # Lo HANIFI ROHINGYA MARK NA KHONNA
+10EC3..10EC4 ; D # Lo [2] ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F30..10F32 ; D # Lo [3] SOGDIAN LETTER ALEPH..SOGDIAN LETTER GIMEL
10F34..10F44 ; D # Lo [17] SOGDIAN LETTER WAW..SOGDIAN LETTER LESH
10F51..10F53 ; D # No [3] SOGDIAN NUMBER ONE..SOGDIAN NUMBER TWENTY
@@ -110,7 +111,7 @@ A840..A871 ; D # Lo [50] PHAGS-PA LETTER KA..PHAGS-PA SUBJOINED LETTER RA
10FCA ; D # No CHORASMIAN NUMBER TWENTY
1E900..1E943 ; D # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA
-# Total code points: 610
+# Total code points: 612
# ================================================
@@ -173,6 +174,7 @@ A840..A871 ; D # Lo [50] PHAGS-PA LETTER KA..PHAGS-PA SUBJOINED LETTER RA
10B91 ; R # Lo PSALTER PAHLAVI LETTER TAW
10BA9..10BAC ; R # No [4] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER FOUR
10D22 ; R # Lo HANIFI ROHINGYA MARK SAKIN
+10EC2 ; R # Lo ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW
10F33 ; R # Lo SOGDIAN LETTER HE
10F54 ; R # No SOGDIAN NUMBER ONE HUNDRED
10F74..10F75 ; R # Lo [2] OLD UYGHUR LETTER ZAYIN..OLD UYGHUR LETTER FINAL HETH
@@ -182,7 +184,7 @@ A840..A871 ; D # Lo [50] PHAGS-PA LETTER KA..PHAGS-PA SUBJOINED LETTER RA
10FC2..10FC3 ; R # Lo [2] CHORASMIAN LETTER RESH..CHORASMIAN LETTER SHIN
10FC9 ; R # No CHORASMIAN NUMBER TEN
-# Total code points: 152
+# Total code points: 153
# ================================================
@@ -228,7 +230,7 @@ A872 ; L # Lo PHAGS-PA SUPERFIXED LETTER RA
0825..0827 ; T # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; T # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; T # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-0898..089F ; T # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; T # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08CA..08E1 ; T # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
08E3..0902 ; T # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
093A ; T # Mn DEVANAGARI VOWEL SIGN OE
@@ -568,6 +570,6 @@ E0001 ; T # Cf LANGUAGE TAG
E0020..E007F ; T # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; T # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2150
+# Total code points: 2151
# EOF
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt
index edfe51f5c..58095fb3d 100644
--- a/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt
+++ b/unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt
@@ -1,5 +1,5 @@
# DerivedLineBreak-16.0.0.txt
-# Date: 2023-10-03, 19:02:11 GMT
+# Date: 2023-10-12, 18:07:05 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -68,8 +68,8 @@ E000..F8FF ; XX # Co [6400] ..
F0000..FFFFD ; XX # Co [65534] ..
100000..10FFFD; XX # Co [65534] ..
-# The above property value applies to 762723 code points not listed here.
-# Total code points: 900191
+# The above property value applies to 762719 code points not listed here.
+# Total code points: 900187
# ================================================
@@ -1299,6 +1299,7 @@ FFED..FFEE ; AL # So [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE
10E60..10E7E ; AL # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS
10E80..10EA9 ; AL # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EB0..10EB1 ; AL # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; AL # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10F00..10F1C ; AL # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
10F1D..10F26 ; AL # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
10F27 ; AL # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
@@ -1580,7 +1581,7 @@ FFED..FFEE ; AL # So [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE
1FB00..1FB92 ; AL # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
1FB94..1FBCA ; AL # So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
-# Total code points: 21731
+# Total code points: 21734
# ================================================
@@ -1881,7 +1882,7 @@ FE19 ; IN # Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
0825..0827 ; CM # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; CM # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; CM # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-0898..089F ; CM # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+0897..089F ; CM # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
08CA..08E1 ; CM # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
08E3..0902 ; CM # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
0903 ; CM # Mc DEVANAGARI SIGN VISARGA
@@ -2336,7 +2337,7 @@ E0001 ; CM # Cf LANGUAGE TAG
E0020..E007F ; CM # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; CM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 2429
+# Total code points: 2430
# ================================================
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedName.txt b/unicodetools/data/ucd/dev/extracted/DerivedName.txt
index ba318df9f..50aab7a0e 100644
--- a/unicodetools/data/ucd/dev/extracted/DerivedName.txt
+++ b/unicodetools/data/ucd/dev/extracted/DerivedName.txt
@@ -1,5 +1,5 @@
# DerivedName-16.0.0.txt
-# Date: 2023-10-03, 19:02:11 GMT
+# Date: 2023-10-12, 18:07:05 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -2098,6 +2098,7 @@
088E ; ARABIC VERTICAL TAIL
0890 ; ARABIC POUND MARK ABOVE
0891 ; ARABIC PIASTRE MARK ABOVE
+0897 ; ARABIC PEPET
0898 ; ARABIC SMALL HIGH WORD AL-JUZ
0899 ; ARABIC SMALL LOW WORD ISHMAAM
089A ; ARABIC SMALL LOW WORD IMAALA
@@ -30067,6 +30068,9 @@ FFFD ; REPLACEMENT CHARACTER
10EAD ; YEZIDI HYPHENATION MARK
10EB0 ; YEZIDI LETTER LAM WITH DOT ABOVE
10EB1 ; YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2 ; ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW
+10EC3 ; ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW
+10EC4 ; ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10EFD ; ARABIC SMALL LOW WORD SAKTA
10EFE ; ARABIC SMALL LOW WORD QASR
10EFF ; ARABIC SMALL LOW WORD MADDA
@@ -44178,6 +44182,6 @@ E01ED ; VARIATION SELECTOR-254
E01EE ; VARIATION SELECTOR-255
E01EF ; VARIATION SELECTOR-256
-# Total code points: 149815
+# Total code points: 149819
# EOF
diff --git a/unicodetools/src/main/java/org/unicode/props/UcdLineParser.java b/unicodetools/src/main/java/org/unicode/props/UcdLineParser.java
index 712d5e0c0..c17f3c326 100644
--- a/unicodetools/src/main/java/org/unicode/props/UcdLineParser.java
+++ b/unicodetools/src/main/java/org/unicode/props/UcdLineParser.java
@@ -111,6 +111,11 @@ public boolean hasNext() {
return false;
}
line = line2 = rawLines.next();
+ if (line.startsWith("<<<<<<<")
+ || line.startsWith("=======")
+ || line.startsWith(">>>>>>>")) {
+ line2 = "";
+ }
++stats.lineCount;
final int hashPos = line2.indexOf('#');
if (hashPos >= 0) {
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java
index e1ff508ad..759361106 100644
--- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java
+++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java
@@ -67,6 +67,7 @@ static class Format {
Map> fileToPropertySet = new TreeMap>();
Map fileToComments = new TreeMap();
Map fileToDirectory = new TreeMap();
+ Map> propertyToOrderedValues = new TreeMap>();
Map> propertyToValueToComments =
new TreeMap>();
Map hackMap = new HashMap();
@@ -110,6 +111,12 @@ public static class PrintStyle {
// Unicode 15.1 and later LineBreak.txt and EastAsianWidth.txt, which are all generated
// in that format by some other tool.
boolean kenFile = false;
+ // Whether the file should be produced in the style of IndicPositionalCategory.txt and
+ // IndicSyllabicCategory.txt, which are both generated in that format by some other
+ // tool.
+ boolean roozbehFile = false;
+ // Whether to separate values of enumerated properties using a line of equal signs.
+ boolean separateValues = true;
boolean hackValues = false;
boolean mergeRanges = true;
String nameStyle = "none";
@@ -138,6 +145,10 @@ String parse(String options) {
interleaveValues = true;
} else if (piece.equals("kenFile")) {
kenFile = true;
+ } else if (piece.equals("roozbehFile")) {
+ roozbehFile = true;
+ } else if (piece.startsWith("separateValues=")) {
+ separateValues = afterEqualsBoolean(piece);
} else if (piece.equals("hackValues")) {
hackValues = true;
} else if (piece.equals("sortNumeric")) {
@@ -301,6 +312,10 @@ private void build() {
}
line = line.trim();
if (line.length() == 0) {
+ if (comments.length() != 0) {
+ // Preserve blank lines between comments.
+ comments += "\n";
+ }
continue;
}
if (DEBUG) {
@@ -321,6 +336,7 @@ private void build() {
comments += line;
} else {
// end of comments, roll up
+ comments = comments.trim();
if (comments.length() != 0) {
if (property != null) {
addValueComments(property, value, comments);
@@ -350,6 +366,10 @@ private void build() {
value = "";
} else if (line.startsWith("Value:")) {
value = lineValue;
+ final var values =
+ propertyToOrderedValues.computeIfAbsent(
+ property, k -> new ArrayList());
+ values.add(value);
} else if (line.startsWith("HackName:")) {
final String regularItem = Utility.getUnskeleton(lineValue, true);
hackMap.put(regularItem, lineValue);
@@ -1152,6 +1172,9 @@ public static void generatePropertyFile(String filename) throws IOException {
filename, Format.theFormat.getPrintStyle(name));
if (!ps.kenFile) {
pwProp.println();
+ if (!ps.separateValues) {
+ pwProp.println();
+ }
pwProp.println(SEPARATOR);
}
final String propComment = Format.theFormat.getValueComments(name, "");
@@ -1161,7 +1184,11 @@ public static void generatePropertyFile(String filename) throws IOException {
pwProp.println(propComment);
} else if (!prop.isType(UnicodeProperty.BINARY_MASK)) {
pwProp.println();
- pwProp.println("# Property:\t" + name);
+ if (ps.roozbehFile) {
+ pwProp.println("# Property: " + name);
+ } else {
+ pwProp.println("# Property:\t" + name);
+ }
}
}
@@ -1182,9 +1209,12 @@ public static void generatePropertyFile(String filename) throws IOException {
v = v + " (" + v2 + ")";
}
}
- pwProp.println();
+ pwProp.println(ps.roozbehFile ? "#" : "");
pwProp.println("# All code points not explicitly listed for " + prop.getName());
- pwProp.println("# have the value " + v + ".");
+ pwProp.println(
+ "# have the value "
+ + v
+ + (ps.roozbehFile && v.equals("NA") ? " (not applicable)." : "."));
}
if (!ps.interleaveValues && prop.isType(UnicodeProperty.BINARY_MASK)) {
@@ -1254,6 +1284,21 @@ private static void writeEnumeratedValues(
temp2.addAll(aliases);
aliases = temp2;
}
+ if (ps.roozbehFile) {
+ aliases.removeIf(alias -> UnicodeProperty.compareNames(alias, ps.skipValue) == 0);
+ if (!Format.theFormat
+ .propertyToOrderedValues
+ .get(prop.getName())
+ .containsAll(aliases)) {
+ final TreeSet missingAliases = new TreeSet(aliases);
+ missingAliases.removeAll(
+ Format.theFormat.propertyToOrderedValues.get(prop.getName()));
+ throw new IllegalArgumentException(
+ "All values must be listed when using roozbehFile; missing "
+ + missingAliases);
+ }
+ aliases = Format.theFormat.propertyToOrderedValues.get(prop.getName());
+ }
if (ps.sortNumeric) {
if (DEBUG) {
System.out.println("Reordering");
@@ -1284,7 +1329,7 @@ private static void writeEnumeratedValues(
final String missing = ps.skipUnassigned != null ? ps.skipUnassigned : ps.skipValue;
if (missing != null && !missing.equals(UCD_Names.NO)) {
- pw.println();
+ pw.println(ps.roozbehFile ? "#" : "");
final String propName = bf.getPropName();
// if (propName == null) propName = "";
// else if (propName.length() != 0) propName = propName + "; ";
@@ -1302,6 +1347,10 @@ private static void writeEnumeratedValues(
writeEnumeratedMissingValues(pw, overallDefault, defaultLbValues);
}
}
+ if (!ps.separateValues) {
+ pw.println();
+ pw.println(SEPARATOR.replace('=', '-'));
+ }
for (final Iterator it = aliases.iterator(); it.hasNext(); ) {
final String value = it.next();
if (DEBUG) {
@@ -1416,9 +1465,13 @@ private static void writeEnumeratedValues(
if (!prop.isType(UnicodeProperty.BINARY_MASK)) {
pw.println();
- pw.println(SEPARATOR);
+ if (ps.separateValues) {
+ pw.println(SEPARATOR);
+ }
if (nonLongValue) {
- pw.println();
+ if (ps.separateValues) {
+ pw.println();
+ }
pw.println("# " + prop.getName() + "=" + value);
}
}
@@ -1442,6 +1495,11 @@ private static void writeEnumeratedValues(
pw.println();
// if (s.size() != 0)
bf.setMergeRanges(ps.mergeRanges);
+ bf.setShowTotal(!ps.roozbehFile);
+ if (ps.roozbehFile) {
+ bf.setRangeBreakSource(
+ ToolUnicodePropertySource.make(Default.ucdVersion()).getProperty("Block"));
+ }
bf.showSetNames(pw, s);
if (DEBUG) {
System.out.println(bf.showSetNames(s));
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java
index c48269675..0f12b0ffc 100644
--- a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java
+++ b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java
@@ -414,19 +414,19 @@ public final class UCD_Names implements UCD_Types {
// Unicode 15
"Kawi",
"Nag_Mundari",
- // A future version of Unicode
- "Sunuwar",
- "Tulu_Tigalari",
- "Kirat_Rai",
- "Todhri",
+ // Unicode 16
"Garay",
"Gurung_Khema",
+ "Kirat_Rai",
"Ol_Onal",
+ "Sunuwar",
+ "Todhri",
+ "Tulu_Tigalari",
// Provisionally assigned
- "Sidetic",
"Chisoi",
- "Tolong_Siki",
+ "Sidetic",
"Tai_Yo",
+ "Tolong_Siki",
};
public static final Relation EXTRA_SCRIPT =
@@ -611,19 +611,19 @@ public final class UCD_Names implements UCD_Types {
// Unicode 15
"Kawi",
"Nagm",
- // A future version of Unicode
- "Qaba",
- "Qabb",
- "Qabc",
- "Qabd",
- "Qabe",
- "Qabf",
- "Qabg",
+ // Unicode 16
+ "Gara",
+ "Gukh",
+ "Krai",
+ "Onao",
+ "Sunu",
+ "Todr",
+ "Tutg",
// Provisionally assigned
- "Qabh",
- "Qabi",
- "Qabj",
- "Qabk",
+ "Chis",
+ "Sidt",
+ "Tayo",
+ "Tols",
};
static final String[] SHORT_AGE = {
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Types.java b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Types.java
index 6f5a76340..972753c37 100644
--- a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Types.java
+++ b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Types.java
@@ -599,20 +599,20 @@ public interface UCD_Types {
// Unicode 15
Kawi = 164,
Nag_Mundari = 165,
- // A future version of Unicode
- Sunuwar = 166,
- Tulu_Tigalari = 167,
+ // Unicode 16
+ Garay = 166,
+ Gurung_Khema = 167,
Kirat_Rai = 168,
- Todhri = 169,
- Garay = 170,
- Gurung_Khema = 171,
- Ol_Onal = 172,
+ Ol_Onal = 169,
+ Sunuwar = 170,
+ Todhri = 171,
+ Tulu_Tigalari = 172,
// Provisionally assigned
- Sidetic = 173,
- Chisoi = 174,
- Tolong_Siki = 175,
- Tai_Yo = 176,
- LIMIT_SCRIPT = Tai_Yo + 1;
+ Chisoi = 173,
+ Sidetic = 174,
+ Tai_Yo = 175,
+ Tolong_Siki = 176,
+ LIMIT_SCRIPT = Tolong_Siki + 1;
// Bidi_Paired_Bracket_Type
public static final byte BPT_N = 0, BPT_O = 1, BPT_C = 2, LIMIT_BPT = 3;
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt
index 702c46ca5..db8ebd7b8 100644
--- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt
+++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt
@@ -908,6 +908,376 @@ Format: kenFile skipValue=Rotated
#
Property: VerticalOrientation
+File: IndicPositionalCategory
+#
+# This file defines the following property:
+#
+# Indic_Positional_Category enumerated property
+#
+# Scope: This property is aimed at the problem of
+# the specification of syllabic structure for Indic scripts.
+# Because dependent vowels (matras), visible viramas, and other
+# characters are placed in notional slots around the consonant (or
+# consonant cluster) core of an Indic syllable, there may be
+# cooccurrence constraints or other interactions. Also, it may be
+# desirable, in cases where more than one such character may occur in
+# sequence, as for example, in a top slot and a bottom slot, to
+# specify preferred orders for spelling. As such, this property
+# is designed primarily to supplement the Indic_Syllabic_Category
+# property.
+#
+# In addition to combining marks associated with Indic scripts, the
+# Indic_Positional_Category has non-trivial values for special signs
+# associated with Indic_Syllabic_Category=Consonant_Prefixed
+# or Indic_Syllabic_Category=Consonant_Preceding_Repha. Those signs
+# have General_Category=Lo, rather than being combining marks.
+# They occur in initial position in syllabic structure. However, when
+# rendered, they appear as marks positioned with respect to another
+# base letter (usually above it). Hence, having an explicit value for
+# Indic_Positional_Category for those signs can be helpful.
+#
+# Note that this property is *not* intended as
+# a prescriptive property regarding display or font design,
+# for a number of reasons. Good font design requires information
+# that is outside the context of a character encoding standard,
+# and is best handled in other venues. For Indic dependent
+# vowels and similar characters, in particular:
+#
+# 1. Matra placement may vary somewhat based on typeface design.
+# 2. Matra placement, even within a single script, may vary
+# somewhat according to historic period or local conventions.
+# 3. Matra placement may be changed by explicit orthographic reform
+# decisions.
+# 4. Matras may ligate in various ways with a consonant (or even
+# other elements of a syllable) instead of occurring in a
+# discrete location.
+# 5. Matra display may be contextually determined. This is
+# notable, for example, in the Tamil script, where the shape
+# and placement of -u and -uu vowels depends strongly on
+# which consonant they adjoin.
+#
+# Format:
+# Field 0 Unicode code point value or range of code point values
+# Field 1 Indic_Positional_Category property value
+#
+# Field 1 is followed by a comment field, starting with the number sign '#',
+# which shows the General_Category property value, the Unicode character name
+# or names, and, in lines with ranges of code points, the code point count in
+# square brackets.
+#
+# The scripts assessed as containing dependent vowels or similar characters
+# in the structural sense used for the Indic_Positional_Category are the
+# following:
+#
+# Ahom, Balinese, Batak, Bengali, Bhaiksuki, Brahmi, Buginese, Buhid,
+# Chakma, Cham, Devanagari, Dives Akuru, Dogra, Grantha, Gujarati,
+# Gunjala Gondi, Gurmukhi, Hanunoo, Javanese, Kaithi, Kannada, Kawi,
+# Kayah Li, Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Lepcha, Limbu,
+# Makasar, Malayalam, Marchen, Masaram Gondi, Meetei Mayek, Modi,
+# Myanmar, Nandinagari, Newa, New Tai Lue, Oriya, Rejang, Saurashtra,
+# Sharada, Siddham, Sinhala, Soyombo, Sundanese, Syloti Nagri,
+# Tagalog, Tagbanwa, Tai Tham, Tai Viet, Takri, Tamil, Telugu, Thai,
+# Tibetan, Tirhuta, and Zanabazar Square.
+#
+# All characters for all other scripts not in that list
+# take the default value for this property.
+#
+# See IndicSyllabicCategory.txt for a slightly more extended
+# list of Indic scripts, including those which do not have
+# positional characters. Currently, those additional
+# Indic scripts without positional characters are
+# Multani, Phags-pa, and Tai Le.
+#
+# Notes:
+#
+# 1. The following characters are all assigned the positional category Right,
+# but may have different positions in some cases:
+# * U+0BC1 TAMIL VOWEL SIGN U and U+0BC2 TAMIL VOWEL SIGN UU have
+# contextually variable placement in Tamil.
+# * U+0D41 MALAYALAM VOWEL SIGN U and U+0D42 MALAYALAM VOWEL SIGN UU form
+# complex ligatures with consonants in older Malayalam orthography.
+# * U+11341 GRANTHA VOWEL SIGN U and U+11342 GRANTHA VOWEL SIGN UU have
+# contextually variable placement in Grantha.
+# * U+11440 NEWA VOWEL SIGN O and U+11441 NEWA VOWEL SIGN AU have contextually
+# variable placement in Newa.
+#
+# 2. The following characters are all assigned the positional category Top,
+# but may have different positions in some cases:
+# * U+1143E NEWA VOWEL SIGN E and U+1143F NEWA VOWEL SIGN AI have contextually
+# variable placement in Newa.
+#
+# 3. The following characters are all assigned the positional category Bottom,
+# but may have different positions in some cases:
+# * U+102F MYANMAR VOWEL SIGN U and U+1030 MYANMAR VOWEL SIGN UU have
+# contextually variable placement in Myanmar.
+# * U+1A69 TAI THAM VOWEL SIGN U and U+1A6A TAI THAM VOWEL SIGN UU have
+# contextually variable placement in Tai Tham.
+#
+# 4. The following character is assigned the positional category Left, but
+# may have different positions in different styles:
+# * U+119D2 NANDINAGARI VOWEL SIGN I has stylistically variable placement
+# in Nandinagari.
+Property: Indic_Positional_Category
+Format: roozbehFile separateValues=false valueStyle=short skipValue=NA
+Value: Right
+Value: Left
+Value: Visual_Order_Left
+
+# These are dependent vowels that occur to the left of the consonant
+# letter in a syllable, but which occur in scripts using the visual order
+# model, instead of the logical order model. Because of the different
+# model, these left-side vowels occur first in the backing store (before
+# the consonant letter) and are not reordered during text rendering.
+#
+# [Derivation: Logical_Order_Exception=Yes]
+Value: Left_And_Right
+Value: Top
+Value: Bottom
+Value: Top_And_Bottom
+Value: Top_And_Right
+Value: Top_And_Left
+Value: Top_And_Left_And_Right
+Value: Bottom_And_Right
+Value: Bottom_And_Left
+Value: Top_And_Bottom_And_Right
+Value: Top_And_Bottom_And_Left
+Value: Overstruck
+
+File: IndicSyllabicCategory
+#
+# This file defines the following property:
+#
+# Indic_Syllabic_Category enumerated property
+#
+# Scope: This property is aimed at two general problem
+# areas involving the analysis and processing of Indic scripts:
+#
+# 1. Specification of syllabic structure.
+# 2. Specification of segmentation rules.
+#
+# Both of these problem areas may benefit from having defined subtypes
+# of Indic script characters which are relevant to how Indic
+# syllables (or aksaras) are constructed. Note that rules for
+# syllabic structure in Indic scripts may differ significantly
+# from how phonological syllables are defined.
+#
+# Format:
+# Field 0 Unicode code point value or range of code point values
+# Field 1 Indic_Syllabic_Category property value
+#
+# Field 1 is followed by a comment field, starting with the number sign '#',
+# which shows the General_Category property value, the Unicode character name
+# or names, and, in lines with ranges of code points, the code point count in
+# square brackets.
+#
+# The scripts assessed as Indic in the structural sense used for the
+# Indic_Syllabic_Category are the following:
+#
+# Ahom, Balinese, Batak, Bengali, Bhaiksuki, Brahmi, Buginese, Buhid,
+# Chakma, Cham, Devanagari, Dives Akuru, Dogra, Grantha, Gujarati,
+# Gunjala Gondi, Gurmukhi, Hanunoo, Javanese, Kaithi, Kannada, Kawi,
+# Kayah Li, Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Lepcha, Limbu,
+# Mahajani, Makasar, Malayalam, Marchen, Masaram Gondi, Meetei Mayek,
+# Modi, Multani, Myanmar, Nandinagari, Newa, New Tai Lue, Oriya,
+# Phags-pa, Rejang, Saurashtra, Sharada, Siddham, Sinhala, Soyombo,
+# Sundanese, Syloti Nagri, Tagalog, Tagbanwa, Tai Le, Tai Tham,
+# Tai Viet, Takri, Tamil, Telugu, Thai, Tibetan, Tirhuta, and
+# Zanabazar Square.
+#
+# All characters for all other scripts not in that list
+# take the default value for this property, unless they
+# are individually listed in this data file.
+#
+Property: Indic_Syllabic_Category
+Format: roozbehFile valueStyle=short skipValue=Other
+Value: Bindu
+# Bindu/Anusvara (nasalization or -n)
+
+# [Not derivable]
+Value: Visarga
+# Visarga (-h)
+# Excludes letters for jihvamuliya and upadhmaniya, which are
+# related, but structured somewhat differently.
+
+# [Not derivable]
+Value: Avagraha
+# Avagraha (elision of initial a- in sandhi)
+
+# [Not derivable]
+Value: Nukta
+# Nukta (diacritic for borrowed consonants or other consonant
+# modifications). Note that while the resulting sound is typically a
+# consonant, the base letter a nukta follows may be an independent
+# vowel. For example, <U+0A85 GUJARATI LETTER A, U+0AFD GUJARATI SIGN
+# THREE-DOT NUKTA ABOVE> is used to transcribe ARABIC LETTER AIN.
+
+# [Not derivable]
+Value: Virama
+# Virama (killing of inherent vowel in consonant sequence
+# or consonant stacker)
+# Only includes characters that can act both as visible killer viramas
+# and consonant stackers. Separate property values exist for characters
+# that can only act as pure killers or only as consonant stackers.
+
+# [Derivation: (ccc=9) - (InSC=Pure_Killer) - (InSC=Invisible_Stacker)
+# - (InSC=Number_Joiner) - 2D7F]
+Value: Pure_Killer
+# Pure killer (killing of inherent vowel in consonant sequence,
+# with no consonant stacking behavior)
+
+# [Not derivable]
+Value: Invisible_Stacker
+# Invisible stacker (invisible consonant stacker virama).
+#
+# Note that in some scripts, such as Kharoshthi and Masaram Gondi, an invisible
+# stacker may have a second function, changing the shape and/or location of the
+# consonant preceding it, even when there is no consonant following the
+# invisible stacker.
+
+# [Not derivable]
+Value: Vowel_Independent
+# Independent Vowels (contrasted with matras)
+
+# [Not derivable]
+Value: Vowel_Dependent
+# Dependent Vowels (contrasted with independent vowels and/or with
+# complex placement). Known as matras in Indic scripts. Also
+# includes vowel modifiers that follow dependent (and sometimes
+# independent) vowels.
+
+# [Not derivable]
+Value: Vowel
+# (Other) Vowels (reanalyzed as ordinary alphabetic letters or marks)
+
+# [Not derivable]
+Value: Consonant_Placeholder
+# Consonant Placeholder
+# This includes generic placeholders used for
+# Indic script layout (NBSP and dotted circle), as well as a few script-
+# specific vowel-holder characters which are not technically
+# consonants, but serve instead as bases for placement of vowel marks.
+
+# [Not derivable]
+Value: Consonant
+# Consonant (ordinary abugida consonants, with inherent vowels)
+
+# [Not derivable]
+Value: Consonant_Dead
+# Dead Consonant (special consonant with killed vowel)
+
+# [Not derivable]
+Value: Consonant_With_Stacker
+# Consonants that may make stacked ligatures with the next consonant
+# without the use of a virama
+
+# [Not derivable]
+Value: Consonant_Prefixed
+# Cluster-initial consonants
+
+# [Not derivable]
+Value: Consonant_Preceding_Repha
+# Repha Form of RA (reanalyzed in some scripts), when preceding the main
+# consonant.
+
+# [Not derivable]
+Value: Consonant_Initial_Postfixed
+# Consonants that succeed the main consonant in character sequences, but are
+# pronounced before it.
+
+# [Not derivable]
+Value: Consonant_Succeeding_Repha
+# Repha Form of RA (reanalyzed in some scripts), when succeeding the main
+# consonant.
+
+# [Not derivable]
+Value: Consonant_Subjoined
+# Subjoined Consonant (C2 form subtending a base consonant in Tibetan, etc.)
+
+# [Not derivable]
+Value: Consonant_Medial
+# Medial Consonant (medial liquid, occurring in clusters)
+
+# [Not derivable]
+Value: Consonant_Final
+# Final Consonant (special final forms which do not take vowels)
+
+# [Not derivable]
+Value: Consonant_Head_Letter
+# Head Letter (Tibetan)
+
+# [Not derivable]
+Value: Modifying_Letter
+# Reanalyzed letters not participating in the abugida structure, but
+# serving to modify the sound of an adjacent vowel or consonant.
+# Note that this is not the same as General_Category=Modifier_Letter.
+
+# [Not derivable]
+Value: Tone_Letter
+# Tone Letter (spacing lexical tone mark with status as a letter)
+
+# [Not derivable]
+Value: Tone_Mark
+# Tone Mark (nonspacing or spacing lexical tone mark)
+
+# [Not derivable]
+Value: Gemination_Mark
+# Gemination Mark (doubling of the preceding or following consonant)
+#
+# U+0A71 GURMUKHI ADDAK precedes the consonant it geminates, while the
+# others follow the consonant they geminate.
+
+# [Not derivable]
+Value: Cantillation_Mark
+# Cantillation Mark (recitation marks, such as svara markers for the Samaveda)
+
+# [Not derivable]
+Value: Register_Shifter
+# Register Shifter (shifts register for consonants, akin to a tone mark)
+
+# [Not derivable]
+Value: Syllable_Modifier
+# Syllable Modifier (miscellaneous combining characters that modify
+# something in the orthographic syllable they succeed or appear in)
+
+# [Not derivable]
+Value: Consonant_Killer
+# Consonant Killer (signifies that the previous consonant or consonants are
+# not pronounced)
+
+# [Not derivable]
+Value: Non_Joiner
+# Non_Joiner (Zero Width Non-Joiner)
+
+# [Not derivable]
+Value: Joiner
+# Joiner (Zero Width Joiner)
+
+# [Not derivable]
+Value: Number_Joiner
+# Number_Joiner (forms ligatures between numbers for multiplication)
+
+# [Not derivable]
+Value: Number
+# Number (can be used as vowel-holders like consonant placeholders)
+# Note: A number may even hold subjoined consonants which may in turn
+# have been formed using a virama or a stacker, e.g. the sequence
+# <U+1A93, U+1A60, U+1A34> where TAI THAM LETTER LOW TA is subjoined to
+# TAI THAM THAM DIGIT THREE using an invisible stacker.
+
+# [Not derivable]
+Value: Brahmi_Joining_Number
+# Brahmi Joining Number (may be joined by a Number_Joiner of the same
+# script, e.g. in Brahmi)
+#
+# Note: These are different from Numbers, in the way that there is no known
+# evidence of Brahmi Joining Numbers taking vowels or subjoined consonants.
+# Until such evidence is found, implementations may assume that Brahmi
+# Joining Numbers only participate in shaping with other Brahmi Joining
+# Numbers.
+
+# [Not derivable]
+
File: UnicodeData
Property: SPECIAL
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
index 8cb202e68..8ee8762b3 100644
--- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
+++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -530,7 +530,10 @@ Show [\u20b9]
# exceptions. Should such exceptions arise, they can be added to the definition of
# $nonAlphabeticBindus to avoid a failure on this test.
Let $nonAlphabeticBindus = []
-[\p{InSc=Bindu} - $nonAlphabeticBindus - \p{Alphabetic}] = []
+[\p{InSc=Bindu} - \p{Alphabetic}] = $nonAlphabeticBindus
+
+Let $nonAlphabeticDependentVowels = [\N{ORIYA SIGN OVERLINE}\N{THAI CHARACTER MAITAIKHU}\N{LIMBU SIGN KEMPHRENG}\N{SHARADA VOWEL MODIFIER MARK}\N{SHARADA EXTRA SHORT VOWEL MARK}]
+[\p{InSC=Vowel_Dependent} - \p{Alphabetic}] = $nonAlphabeticDependentVowels
##########################
# LineBreak property