From ed4ecf0b2f81eb8f39d16b8fc6a0b30488d6316a Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 2 Oct 2023 16:50:37 +0200 Subject: [PATCH 1/7] First attempt at generating InPC --- .../org/unicode/text/UCD/MakeUnicodeFiles.txt | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt index 702c46ca5..645a490e2 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt @@ -908,6 +908,126 @@ Format: kenFile skipValue=Rotated # Property: VerticalOrientation +File: IndicPositionalCategory +# IndicPositionalCategory-15.1.0.txt +# Date: 2023-01-05 +# © 2023 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see https://www.unicode.org/terms_of_use.html +# +# For documentation, see UAX #44: Unicode Character Database, +# at https://www.unicode.org/reports/tr44/ +# +# This file defines the following property: +# +# Indic_Positional_Category enumerated property +# +# Scope: This property is aimed at the problem of +# the specification of syllabic structure for Indic scripts. +# Because dependent vowels (matras), visible viramas, and other +# characters are placed in notional slots around the consonant (or +# consonant cluster) core of an Indic syllable, there may be +# cooccurrence constraints or other interactions. Also, it may be +# desirable, in cases where more than one such character may occur in +# sequence, as for example, in a top slot and a bottom slot, to +# specify preferred orders for spelling. As such, this property +# is designed primarily to supplement the Indic_Syllabic_Category +# property. 
+# +# In addition to combining marks associated with Indic scripts, the +# Indic_Positional_Category has non-trivial values for special signs +# associated with Indic_Syllabic_Category=Consonant_Prefixed +# or Indic_Syllabic_Category=Consonant_Preceding_Repha. Those signs +# have General_Category=Lo, rather than being combining marks. +# They occur in initial position in syllabic structure. However, when +# rendered, they appear as marks positioned with respect to another +# base letter (usually above it). Hence, having an explicit value for +# Indic_Positional_Category for those signs can be helpful. +# +# Note that this property is *not* intended as +# a prescriptive property regarding display or font design, +# for a number of reasons. Good font design requires information +# that is outside the context of a character encoding standard, +# and is best handled in other venues. For Indic dependent +# vowels and similar characters, in particular: +# +# 1. Matra placement may vary somewhat based on typeface design. +# 2. Matra placement, even within a single script, may vary +# somewhat according to historic period or local conventions. +# 3. Matra placement may be changed by explicit orthographic reform +# decisions. +# 4. Matras may ligate in various ways with a consonant (or even +# other elements of a syllable) instead of occurring in a +# discrete location. +# 5. Matra display may be contextually determined. This is +# notable, for example, in the Tamil script, where the shape +# and placement of -u and -uu vowels depends strongly on +# which consonant they adjoin. +# +# Format: +# Field 0 Unicode code point value or range of code point values +# Field 1 Indic_Positional_Category property value +# +# Field 1 is followed by a comment field, starting with the number sign '#', +# which shows the General_Category property value, the Unicode character name +# or names, and, in lines with ranges of code points, the code point count in +# square brackets. 
+# +# The scripts assessed as containing dependent vowels or similar characters +# in the structural sense used for the Indic_Positional_Category are the +# following: +# +# Ahom, Balinese, Batak, Bengali, Bhaiksuki, Brahmi, Buginese, Buhid, +# Chakma, Cham, Devanagari, Dives Akuru, Dogra, Grantha, Gujarati, +# Gunjala Gondi, Gurmukhi, Hanunoo, Javanese, Kaithi, Kannada, Kawi, +# Kayah Li, Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Lepcha, Limbu, +# Makasar, Malayalam, Marchen, Masaram Gondi, Meetei Mayek, Modi, +# Myanmar, Nandinagari, Newa, New Tai Lue, Oriya, Rejang, Saurashtra, +# Sharada, Siddham, Sinhala, Soyombo, Sundanese, Syloti Nagri, +# Tagalog, Tagbanwa, Tai Tham, Tai Viet, Takri, Tamil, Telugu, Thai, +# Tibetan, Tirhuta, and Zanabazar Square. +# +# All characters for all other scripts not in that list +# take the default value for this property. +# +# See IndicSyllabicCategory.txt for a slightly more extended +# list of Indic scripts, including those which do not have +# positional characters. Currently, those additional +# Indic scripts without positional characters are +# Multani, Phags-pa, and Tai Le. +# +# Notes: +# +# 1. The following characters are all assigned the positional category Right, +# but may have different positions in some cases: +# * U+0BC1 TAMIL VOWEL SIGN U and U+0BC2 TAMIL VOWEL SIGN UU have +# contextually variable placement in Tamil. +# * U+0D41 MALAYALAM VOWEL SIGN U and U+0D42 MALAYALAM VOWEL SIGN UU form +# complex ligatures with consonants in older Malayalam orthography. +# * U+11341 GRANTHA VOWEL SIGN U and U+11342 GRANTHA VOWEL SIGN UU have +# contextually variable placement in Grantha. +# * U+11440 NEWA VOWEL SIGN O and U+11441 NEWA VOWEL SIGN AU have contextually +# variable placement in Newa. +# +# 2. 
The following characters are all assigned the positional category Top, +# but may have different positions in some cases: +# * U+1143E NEWA VOWEL SIGN E and U+1143F NEWA VOWEL SIGN AI have contextually +# variable placement in Newa. +# +# 3. The following characters are all assigned the positional category Bottom, +# but may have different positions in some cases: +# * U+102F MYANMAR VOWEL SIGN U and U+1030 MYANMAR VOWEL SIGN UU have +# contextually variable placement in Myanmar. +# * U+1A69 TAI THAM VOWEL SIGN U and U+1A6A TAI THAM VOWEL SIGN UU have +# contextually variable placement in Tai Tham. +# +# 4. The following character is assigned the positional category Left, but +# may have different positions in different styles: +# * U+119D2 NANDINAGARI VOWEL SIGN I has stylistically variable placement +# in Nandinagari. +Property: Indic_Positional_Category +Format: skipValue=NA + File: UnicodeData Property: SPECIAL From 8349834417948514bfca81d3c5a27c846ed40d54 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 2 Oct 2023 17:32:58 +0200 Subject: [PATCH 2/7] Some progress towards roozbehFile --- .../unicode/text/UCD/MakeUnicodeFiles.java | 18 ++++++++++ .../org/unicode/text/UCD/MakeUnicodeFiles.txt | 35 +++++++++++++------ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index e1ff508ad..afb906a02 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -67,6 +67,8 @@ static class Format { Map> fileToPropertySet = new TreeMap>(); Map fileToComments = new TreeMap(); Map fileToDirectory = new TreeMap(); + Map> propertyToOrderedValues = + new TreeMap>(); Map> propertyToValueToComments = new TreeMap>(); Map hackMap = new HashMap(); @@ -110,6 +112,10 @@ public static class PrintStyle { // Unicode 15.1 and later 
LineBreak.txt and EastAsianWidth.txt, which are all generated // in that format by some other tool. boolean kenFile = false; + // Whether the file should be produced in the style of IndicPositionalCategory.txt and + // IndicSyllabicCategory.txt, which are both generated in that format by some other + // tool. + boolean roozbehFile = false; boolean hackValues = false; boolean mergeRanges = true; String nameStyle = "none"; @@ -138,6 +144,8 @@ String parse(String options) { interleaveValues = true; } else if (piece.equals("kenFile")) { kenFile = true; + } else if (piece.equals("roozbehFile")) { + roozbehFile = true; } else if (piece.equals("hackValues")) { hackValues = true; } else if (piece.equals("sortNumeric")) { @@ -350,6 +358,12 @@ private void build() { value = ""; } else if (line.startsWith("Value:")) { value = lineValue; + var values = propertyToOrderedValues.get(property); + if (values == null) { + values = new ArrayList(); + propertyToOrderedValues.put(property, values); + } + values.add(value); } else if (line.startsWith("HackName:")) { final String regularItem = Utility.getUnskeleton(lineValue, true); hackMap.put(regularItem, lineValue); @@ -1254,6 +1268,10 @@ private static void writeEnumeratedValues( temp2.addAll(aliases); aliases = temp2; } + if (ps.roozbehFile) { + System.out.println(Format.theFormat.propertyToOrderedValues); + aliases = Format.theFormat.propertyToOrderedValues.get(prop.getName()); + } if (ps.sortNumeric) { if (DEBUG) { System.out.println("Reordering"); diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt index 645a490e2..6902d6b2c 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt @@ -909,15 +909,6 @@ Format: kenFile skipValue=Rotated Property: VerticalOrientation File: IndicPositionalCategory -# 
IndicPositionalCategory-15.1.0.txt -# Date: 2023-01-05 -# © 2023 Unicode®, Inc. -# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. -# For terms of use, see https://www.unicode.org/terms_of_use.html -# -# For documentation, see UAX #44: Unicode Character Database, -# at https://www.unicode.org/reports/tr44/ -# # This file defines the following property: # # Indic_Positional_Category enumerated property @@ -1025,8 +1016,32 @@ File: IndicPositionalCategory # may have different positions in different styles: # * U+119D2 NANDINAGARI VOWEL SIGN I has stylistically variable placement # in Nandinagari. + Property: Indic_Positional_Category -Format: skipValue=NA +Format: roozbehFile skipValue=NA +Value: Right +Value: Left +Value: Visual_Order_Left + +# These are dependent vowels that occur to the left of the consonant +# letter in a syllable, but which occur in scripts using the visual order +# model, instead of the logical order model. Because of the different +# model, these left-side vowels occur first in the backing store (before +# the consonant letter) and are not reordered during text rendering. 
+# +# [Derivation: Logical_Order_Exception=Yes] +Value: Left_And_Right +Value: Top +Value: Bottom +Value: Top_And_Bottom +Value: Top_And_Right +Value: Top_And_Left +Value: Top_And_Left_And_Right +Value: Bottom_And_Right +Value: Bottom_And_Left +Value: Top_And_Bottom_And_Right +Value: Top_And_Bottom_And_Left +Value: Overstruck File: UnicodeData Property: SPECIAL From 52bbc737086f3afab7d661d6c354b713e00aed51 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 2 Oct 2023 17:39:18 +0200 Subject: [PATCH 3/7] --- --- .../main/java/org/unicode/text/UCD/MakeUnicodeFiles.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index afb906a02..981849162 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -1434,7 +1434,11 @@ private static void writeEnumeratedValues( if (!prop.isType(UnicodeProperty.BINARY_MASK)) { pw.println(); - pw.println(SEPARATOR); + if (ps.roozbehFile) { + pw.println(SEPARATOR.replace('=', '-')); + } else { + pw.println(SEPARATOR); + } if (nonLongValue) { pw.println(); pw.println("# " + prop.getName() + "=" + value); From cd39981f7766d285380e211f72f7800cb37298c4 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 2 Oct 2023 18:10:09 +0200 Subject: [PATCH 4/7] more roozbehifying --- .../org/unicode/text/UCD/MakeUnicodeFiles.java | 18 +++++++++++++----- .../org/unicode/text/UCD/MakeUnicodeFiles.txt | 3 ++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index 981849162..3b9c8b0b8 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ 
b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -714,7 +714,11 @@ private static void writeUnihan(String directory) throws IOException { pw.println(); pw.println(SEPARATOR); pw.println(); - pw.println("# Property:\t" + propName); + if (ps.roozbehFile) { + pw.println("# Property: " + propName); + } else { + pw.println("# Property:\t" + propName); + } final UnicodeMap map = props.get(propName); @@ -1320,6 +1324,9 @@ private static void writeEnumeratedValues( writeEnumeratedMissingValues(pw, overallDefault, defaultLbValues); } } + if (ps.roozbehFile) { + pw.println(SEPARATOR.replace('=', '-')); + } for (final Iterator it = aliases.iterator(); it.hasNext(); ) { final String value = it.next(); if (DEBUG) { @@ -1434,13 +1441,13 @@ private static void writeEnumeratedValues( if (!prop.isType(UnicodeProperty.BINARY_MASK)) { pw.println(); - if (ps.roozbehFile) { - pw.println(SEPARATOR.replace('=', '-')); - } else { + if (!ps.roozbehFile) { pw.println(SEPARATOR); } if (nonLongValue) { - pw.println(); + if (!ps.roozbehFile) { + pw.println(); + } pw.println("# " + prop.getName() + "=" + value); } } @@ -1464,6 +1471,7 @@ private static void writeEnumeratedValues( pw.println(); // if (s.size() != 0) bf.setMergeRanges(ps.mergeRanges); + bf.setShowTotal(!ps.roozbehFile); bf.showSetNames(pw, s); if (DEBUG) { System.out.println(bf.showSetNames(s)); diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt index 6902d6b2c..3eaec2cdd 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt @@ -909,6 +909,7 @@ Format: kenFile skipValue=Rotated Property: VerticalOrientation File: IndicPositionalCategory +# # This file defines the following property: # # Indic_Positional_Category enumerated property @@ -1018,7 +1019,7 @@ File: IndicPositionalCategory 
# in Nandinagari. Property: Indic_Positional_Category -Format: roozbehFile skipValue=NA +Format: roozbehFile valueStyle=short skipValue=NA Value: Right Value: Left Value: Visual_Order_Left From fb6bf4e50b8174411c2bba9ff3606f0d7bf15869 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 2 Oct 2023 20:06:10 +0200 Subject: [PATCH 5/7] As good as it will get. --- .../data/ucd/dev/IndicPositionalCategory.txt | 8 +++---- .../unicode/text/UCD/MakeUnicodeFiles.java | 22 +++++++++++-------- .../org/unicode/text/UCD/MakeUnicodeFiles.txt | 1 - 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/unicodetools/data/ucd/dev/IndicPositionalCategory.txt b/unicodetools/data/ucd/dev/IndicPositionalCategory.txt index a7c5aef60..0381f42cb 100644 --- a/unicodetools/data/ucd/dev/IndicPositionalCategory.txt +++ b/unicodetools/data/ucd/dev/IndicPositionalCategory.txt @@ -1,11 +1,11 @@ -# IndicPositionalCategory-15.1.0.txt -# Date: 2023-01-05 +# IndicPositionalCategory-16.0.0.txt +# Date: 2023-10-02, 18:04:25 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use, see https://www.unicode.org/terms_of_use.html # -# For documentation, see UAX #44: Unicode Character Database, -# at https://www.unicode.org/reports/tr44/ +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ # # This file defines the following property: # diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index 3b9c8b0b8..6fbe8432c 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -714,11 +714,7 @@ private static void writeUnihan(String directory) throws IOException { pw.println(); pw.println(SEPARATOR); pw.println(); - if (ps.roozbehFile) { - pw.println("# Property: " + propName); - } else { - pw.println("# Property:\t" + propName); - } + pw.println("# Property:\t" + propName); final UnicodeMap map = props.get(propName); @@ -1170,6 +1166,9 @@ public static void generatePropertyFile(String filename) throws IOException { filename, Format.theFormat.getPrintStyle(name)); if (!ps.kenFile) { pwProp.println(); + if (ps.roozbehFile) { + pwProp.println(); + } pwProp.println(SEPARATOR); } final String propComment = Format.theFormat.getValueComments(name, ""); @@ -1179,7 +1178,11 @@ public static void generatePropertyFile(String filename) throws IOException { pwProp.println(propComment); } else if (!prop.isType(UnicodeProperty.BINARY_MASK)) { pwProp.println(); - pwProp.println("# Property:\t" + name); + if (ps.roozbehFile) { + pwProp.println("# Property: " + name); + } else { + pwProp.println("# Property:\t" + name); + } } } @@ -1200,9 +1203,9 @@ public static void generatePropertyFile(String filename) throws IOException { v = v + " (" + v2 + ")"; } } - pwProp.println(); + pwProp.println(ps.roozbehFile ? 
"#" : ""); pwProp.println("# All code points not explicitly listed for " + prop.getName()); - pwProp.println("# have the value " + v + "."); + pwProp.println("# have the value " + v + (ps.roozbehFile && v.equals("NA") ? " (not applicable)." : ".")); } if (!ps.interleaveValues && prop.isType(UnicodeProperty.BINARY_MASK)) { @@ -1306,7 +1309,7 @@ private static void writeEnumeratedValues( final String missing = ps.skipUnassigned != null ? ps.skipUnassigned : ps.skipValue; if (missing != null && !missing.equals(UCD_Names.NO)) { - pw.println(); + pw.println(ps.roozbehFile ? "#" : ""); final String propName = bf.getPropName(); // if (propName == null) propName = ""; // else if (propName.length() != 0) propName = propName + "; "; @@ -1325,6 +1328,7 @@ private static void writeEnumeratedValues( } } if (ps.roozbehFile) { + pw.println(); pw.println(SEPARATOR.replace('=', '-')); } for (final Iterator it = aliases.iterator(); it.hasNext(); ) { diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt index 3eaec2cdd..a2fc7229a 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt @@ -1017,7 +1017,6 @@ File: IndicPositionalCategory # may have different positions in different styles: # * U+119D2 NANDINAGARI VOWEL SIGN I has stylistically variable placement # in Nandinagari. 
- Property: Indic_Positional_Category Format: roozbehFile valueStyle=short skipValue=NA Value: Right From d6b73d940daa97673c5d739ad3584b6e56c2430e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 3 Oct 2023 01:03:58 +0200 Subject: [PATCH 6/7] Seems to work well enough --- .../data/ucd/dev/IndicPositionalCategory.txt | 2 +- .../data/ucd/dev/IndicSyllabicCategory.txt | 10 +- .../unicode/text/UCD/MakeUnicodeFiles.java | 21 +- .../org/unicode/text/UCD/MakeUnicodeFiles.txt | 237 +++++++++++++++++- 4 files changed, 258 insertions(+), 12 deletions(-) diff --git a/unicodetools/data/ucd/dev/IndicPositionalCategory.txt b/unicodetools/data/ucd/dev/IndicPositionalCategory.txt index 0381f42cb..9b5aabfa0 100644 --- a/unicodetools/data/ucd/dev/IndicPositionalCategory.txt +++ b/unicodetools/data/ucd/dev/IndicPositionalCategory.txt @@ -1,5 +1,5 @@ # IndicPositionalCategory-16.0.0.txt -# Date: 2023-10-02, 18:04:25 GMT +# Date: 2023-10-02, 22:58:33 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html diff --git a/unicodetools/data/ucd/dev/IndicSyllabicCategory.txt b/unicodetools/data/ucd/dev/IndicSyllabicCategory.txt index f2623b471..5de0d7554 100644 --- a/unicodetools/data/ucd/dev/IndicSyllabicCategory.txt +++ b/unicodetools/data/ucd/dev/IndicSyllabicCategory.txt @@ -1,11 +1,11 @@ -# IndicSyllabicCategory-15.1.0.txt -# Date: 2023-01-05 +# IndicSyllabicCategory-16.0.0.txt +# Date: 2023-10-02, 22:58:33 GMT # © 2023 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use, see https://www.unicode.org/terms_of_use.html # -# For documentation, see UAX #44: Unicode Character Database, -# at https://www.unicode.org/reports/tr44/ +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ # # This file defines the following property: # @@ -1335,7 +1335,7 @@ ABF0..ABF9 ; Number # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NI # script, e.g. in Brahmi) # # Note: These are different from Numbers, in the way that there is no known -# evidence of Brahmi Joining Numbers taking vowels or subjoined consonants. +# evidence of Brahmi Joining Numbers taking vowels or subjoined consonants. # Until such evidence is found, implementations may assume that Brahmi # Joining Numbers only participate in shaping with other Brahmi Joining # Numbers. diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index 6fbe8432c..d24e283c9 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -116,6 +116,8 @@ public static class PrintStyle { // IndicSyllabicCategory.txt, which are both generated in that format by some other // tool. boolean roozbehFile = false; + // Whether to separate values of enumerated properties using a line of equal signs. 
+ boolean separateValues = true; boolean hackValues = false; boolean mergeRanges = true; String nameStyle = "none"; @@ -146,6 +148,8 @@ String parse(String options) { kenFile = true; } else if (piece.equals("roozbehFile")) { roozbehFile = true; + } else if (piece.startsWith("separateValues=")) { + separateValues = afterEqualsBoolean(piece); } else if (piece.equals("hackValues")) { hackValues = true; } else if (piece.equals("sortNumeric")) { @@ -309,6 +313,10 @@ private void build() { } line = line.trim(); if (line.length() == 0) { + if (comments.length() != 0) { + // Preserve blank lines between comments. + comments += "\n"; + } continue; } if (DEBUG) { @@ -329,6 +337,7 @@ private void build() { comments += line; } else { // end of comments, roll up + comments = comments.trim(); if (comments.length() != 0) { if (property != null) { addValueComments(property, value, comments); @@ -1166,7 +1175,7 @@ public static void generatePropertyFile(String filename) throws IOException { filename, Format.theFormat.getPrintStyle(name)); if (!ps.kenFile) { pwProp.println(); - if (ps.roozbehFile) { + if (!ps.separateValues) { pwProp.println(); } pwProp.println(SEPARATOR); @@ -1276,7 +1285,6 @@ private static void writeEnumeratedValues( aliases = temp2; } if (ps.roozbehFile) { - System.out.println(Format.theFormat.propertyToOrderedValues); aliases = Format.theFormat.propertyToOrderedValues.get(prop.getName()); } if (ps.sortNumeric) { @@ -1327,7 +1335,7 @@ private static void writeEnumeratedValues( writeEnumeratedMissingValues(pw, overallDefault, defaultLbValues); } } - if (ps.roozbehFile) { + if (!ps.separateValues) { pw.println(); pw.println(SEPARATOR.replace('=', '-')); } @@ -1445,11 +1453,11 @@ private static void writeEnumeratedValues( if (!prop.isType(UnicodeProperty.BINARY_MASK)) { pw.println(); - if (!ps.roozbehFile) { + if (ps.separateValues) { pw.println(SEPARATOR); } if (nonLongValue) { - if (!ps.roozbehFile) { + if (ps.separateValues) { pw.println(); } pw.println("# " + 
prop.getName() + "=" + value); @@ -1476,6 +1484,9 @@ private static void writeEnumeratedValues( // if (s.size() != 0) bf.setMergeRanges(ps.mergeRanges); bf.setShowTotal(!ps.roozbehFile); + if (ps.roozbehFile) { + bf.setRangeBreakSource(ToolUnicodePropertySource.make(Default.ucdVersion()).getProperty("Block")); + } bf.showSetNames(pw, s); if (DEBUG) { System.out.println(bf.showSetNames(s)); diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt index a2fc7229a..db8ebd7b8 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt @@ -1018,7 +1018,7 @@ File: IndicPositionalCategory # * U+119D2 NANDINAGARI VOWEL SIGN I has stylistically variable placement # in Nandinagari. Property: Indic_Positional_Category -Format: roozbehFile valueStyle=short skipValue=NA +Format: roozbehFile separateValues=false valueStyle=short skipValue=NA Value: Right Value: Left Value: Visual_Order_Left @@ -1043,6 +1043,241 @@ Value: Top_And_Bottom_And_Right Value: Top_And_Bottom_And_Left Value: Overstruck +File: IndicSyllabicCategory +# +# This file defines the following property: +# +# Indic_Syllabic_Category enumerated property +# +# Scope: This property is aimed at two general problem +# areas involving the analysis and processing of Indic scripts: +# +# 1. Specification of syllabic structure. +# 2. Specification of segmentation rules. +# +# Both of these problem areas may benefit from having defined subtypes +# of Indic script characters which are relevant to how Indic +# syllables (or aksaras) are constructed. Note that rules for +# syllabic structure in Indic scripts may differ significantly +# from how phonological syllables are defined. 
+# +# Format: +# Field 0 Unicode code point value or range of code point values +# Field 1 Indic_Syllabic_Category property value +# +# Field 1 is followed by a comment field, starting with the number sign '#', +# which shows the General_Category property value, the Unicode character name +# or names, and, in lines with ranges of code points, the code point count in +# square brackets. +# +# The scripts assessed as Indic in the structural sense used for the +# Indic_Syllabic_Category are the following: +# +# Ahom, Balinese, Batak, Bengali, Bhaiksuki, Brahmi, Buginese, Buhid, +# Chakma, Cham, Devanagari, Dives Akuru, Dogra, Grantha, Gujarati, +# Gunjala Gondi, Gurmukhi, Hanunoo, Javanese, Kaithi, Kannada, Kawi, +# Kayah Li, Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Lepcha, Limbu, +# Mahajani, Makasar, Malayalam, Marchen, Masaram Gondi, Meetei Mayek, +# Modi, Multani, Myanmar, Nandinagari, Newa, New Tai Lue, Oriya, +# Phags-pa, Rejang, Saurashtra, Sharada, Siddham, Sinhala, Soyombo, +# Sundanese, Syloti Nagri, Tagalog, Tagbanwa, Tai Le, Tai Tham, +# Tai Viet, Takri, Tamil, Telugu, Thai, Tibetan, Tirhuta, and +# Zanabazar Square. +# +# All characters for all other scripts not in that list +# take the default value for this property, unless they +# are individually listed in this data file. +# +Property: Indic_Syllabic_Category +Format: roozbehFile valueStyle=short skipValue=Other +Value: Bindu +# Bindu/Anusvara (nasalization or -n) + +# [Not derivable] +Value: Visarga +# Visarga (-h) +# Excludes letters for jihvamuliya and upadhmaniya, which are +# related, but structured somewhat differently. + +# [Not derivable] +Value: Avagraha +# Avagraha (elision of initial a- in sandhi) + +# [Not derivable] +Value: Nukta +# Nukta (diacritic for borrowed consonants or other consonant +# modifications). Note that while the resulting sound is typically a +# consonant, the base letter a nukta follows may be an independent +# vowel. 
For example, is used to transcribe ARABIC LETTER +# AIN. + +# [Not derivable] +Value: Virama +# Virama (killing of inherent vowel in consonant sequence +# or consonant stacker) +# Only includes characters that can act both as visible killer viramas +# and consonant stackers. Separate property values exist for characters +# that can only act as pure killers or only as consonant stackers. + +# [Derivation: (ccc=9) - (InSC=Pure_Killer) - (InSC=Invisible_Stacker) +# - (InSC=Number_Joiner) - 2D7F] +Value: Pure_Killer +# Pure killer (killing of inherent vowel in consonant sequence, +# with no consonant stacking behavior) + +# [Not derivable] +Value: Invisible_Stacker +# Invisible stacker (invisible consonant stacker virama). +# +# Note that in some scripts, such as Kharoshthi and Masaram Gondi, an invisible +# stacker may have a second function, changing the shape and/or location of the +# consonant preceding it, even when there is no consonant following the +# invisible stacker. + +# [Not derivable] +Value: Vowel_Independent +# Independent Vowels (contrasted with matras) + +# [Not derivable] +Value: Vowel_Dependent +# Dependent Vowels (contrasted with independent vowels and/or with +# complex placement). Known as matras in Indic scripts. Also +# includes vowel modifiers that follow dependent (and sometimes +# independent) vowels. + +# [Not derivable] +Value: Vowel +# (Other) Vowels (reanalyzed as ordinary alphabetic letters or marks) + +# [Not derivable] +Value: Consonant_Placeholder +# Consonant Placeholder +# This includes generic placeholders used for +# Indic script layout (NBSP and dotted circle), as well as a few script- +# specific vowel-holder characters which are not technically +# consonants, but serve instead as bases for placement of vowel marks. 
+ +# [Not derivable] +Value: Consonant +# Consonant (ordinary abugida consonants, with inherent vowels) + +# [Not derivable] +Value: Consonant_Dead +# Dead Consonant (special consonant with killed vowel) + +# [Not derivable] +Value: Consonant_With_Stacker +# Consonants that may make stacked ligatures with the next consonant +# without the use of a virama + +# [Not derivable] +Value: Consonant_Prefixed +# Cluster-initial consonants + +# [Not derivable] +Value: Consonant_Preceding_Repha +# Repha Form of RA (reanalyzed in some scripts), when preceding the main +# consonant. + +# [Not derivable] +Value: Consonant_Initial_Postfixed +# Consonants that succeed the main consonant in character sequences, but are +# pronounced before it. + +# [Not derivable] +Value: Consonant_Succeeding_Repha +# Repha Form of RA (reanalyzed in some scripts), when succeeding the main +# consonant. + +# [Not derivable] +Value: Consonant_Subjoined +# Subjoined Consonant (C2 form subtending a base consonant in Tibetan, etc.) + +# [Not derivable] +Value: Consonant_Medial +# Medial Consonant (medial liquid, occurring in clusters) + +# [Not derivable] +Value: Consonant_Final +# Final Consonant (special final forms which do not take vowels) + +# [Not derivable] +Value: Consonant_Head_Letter +# Head Letter (Tibetan) + +# [Not derivable] +Value: Modifying_Letter +# Reanalyzed letters not participating in the abugida structure, but +# serving to modify the sound of an adjacent vowel or consonant. +# Note that this is not the same as General_Category=Modifier_Letter. + +# [Not derivable] +Value: Tone_Letter +# Tone Letter (spacing lexical tone mark with status as a letter) + +# [Not derivable] +Value: Tone_Mark +# Tone Mark (nonspacing or spacing lexical tone mark) + +# [Not derivable] +Value: Gemination_Mark +# Gemination Mark (doubling of the preceding or following consonant) +# +# U+0A71 GURMUKHI ADDAK precedes the consonant it geminates, while the +# others follow the consonant they geminate. 
+# [Not derivable] +Value: Cantillation_Mark +# Cantillation Mark (recitation marks, such as svara markers for the Samaveda) + +# [Not derivable] +Value: Register_Shifter +# Register Shifter (shifts register for consonants, akin to a tone mark) + +# [Not derivable] +Value: Syllable_Modifier +# Syllable Modifier (miscellaneous combining characters that modify +# something in the orthographic syllable they succeed or appear in) + +# [Not derivable] +Value: Consonant_Killer +# Consonant Killer (signifies that the previous consonant or consonants are +# not pronounced) + +# [Not derivable] +Value: Non_Joiner +# Non_Joiner (Zero Width Non-Joiner) + +# [Not derivable] +Value: Joiner +# Joiner (Zero Width Joiner) + +# [Not derivable] +Value: Number_Joiner +# Number_Joiner (forms ligatures between numbers for multiplication) + +# [Not derivable] +Value: Number +# Number (can be used as vowel-holders like consonant placeholders) +# Note: A number may even hold subjoined consonants which may in turn +# have been formed using a virama or a stacker, e.g. the sequence +# <U+1A93, U+1A60, U+1A34> where TAI THAM LETTER LOW TA is subjoined to +# TAI THAM THAM DIGIT THREE using an invisible stacker. + +# [Not derivable] +Value: Brahmi_Joining_Number +# Brahmi Joining Number (may be joined by a Number_Joiner of the same +# script, e.g. in Brahmi) +# +# Note: These are different from Numbers, in the way that there is no known +# evidence of Brahmi Joining Numbers taking vowels or subjoined consonants. +# Until such evidence is found, implementations may assume that Brahmi +# Joining Numbers only participate in shaping with other Brahmi Joining +# Numbers. 
+ +# [Not derivable] + File: UnicodeData Property: SPECIAL From e0e2fc26cd5342de9c033f8729f4d7c182d335b3 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 3 Oct 2023 01:06:07 +0200 Subject: [PATCH 7/7] spotless --- .../java/org/unicode/text/UCD/MakeUnicodeFiles.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index d24e283c9..de6a5dea9 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -67,8 +67,7 @@ static class Format { Map> fileToPropertySet = new TreeMap>(); Map fileToComments = new TreeMap(); Map fileToDirectory = new TreeMap(); - Map> propertyToOrderedValues = - new TreeMap>(); + Map> propertyToOrderedValues = new TreeMap>(); Map> propertyToValueToComments = new TreeMap>(); Map hackMap = new HashMap(); @@ -1214,7 +1213,10 @@ public static void generatePropertyFile(String filename) throws IOException { } pwProp.println(ps.roozbehFile ? "#" : ""); pwProp.println("# All code points not explicitly listed for " + prop.getName()); - pwProp.println("# have the value " + v + (ps.roozbehFile && v.equals("NA") ? " (not applicable)." : ".")); + pwProp.println( + "# have the value " + + v + + (ps.roozbehFile && v.equals("NA") ? " (not applicable)." : ".")); } if (!ps.interleaveValues && prop.isType(UnicodeProperty.BINARY_MASK)) { @@ -1485,7 +1487,8 @@ private static void writeEnumeratedValues( bf.setMergeRanges(ps.mergeRanges); bf.setShowTotal(!ps.roozbehFile); if (ps.roozbehFile) { - bf.setRangeBreakSource(ToolUnicodePropertySource.make(Default.ucdVersion()).getProperty("Block")); + bf.setRangeBreakSource( + ToolUnicodePropertySource.make(Default.ucdVersion()).getProperty("Block")); } bf.showSetNames(pw, s); if (DEBUG) {