Skip to content

Commit

Permalink
Semicolons and Sentence Breaks (#812)
Browse files Browse the repository at this point in the history
  • Loading branch information
josh-hadley authored May 9, 2024
1 parent 3e8adc4 commit 74ff6e9
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 25 deletions.
12 changes: 9 additions & 3 deletions unicodetools/data/ucd/dev/PropList.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# PropList-16.0.0.txt
# Date: 2024-05-06, 12:17:26 GMT
# Date: 2024-05-08, 03:40:06 GMT
# © 2024 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
Expand Down Expand Up @@ -160,6 +160,7 @@ FF63 ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET
2024 ; Terminal_Punctuation # Po ONE DOT LEADER
203C..203D ; Terminal_Punctuation # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2CF9..2CFB ; Terminal_Punctuation # Po [3] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN INDIRECT QUESTION MARK
2E2E ; Terminal_Punctuation # Po REVERSED QUESTION MARK
2E3C ; Terminal_Punctuation # Po STENOGRAPHIC FULL STOP
2E41 ; Terminal_Punctuation # Po REVERSED COMMA
Expand All @@ -178,6 +179,8 @@ AA5D..AA5F ; Terminal_Punctuation # Po [3] CHAM PUNCTUATION DANDA..CHAM PUN
AADF ; Terminal_Punctuation # Po TAI VIET SYMBOL KOI KOI
AAF0..AAF1 ; Terminal_Punctuation # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
ABEB ; Terminal_Punctuation # Po MEETEI MAYEK CHEIKHEI
FE12 ; Terminal_Punctuation # Po PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP
FE15..FE16 ; Terminal_Punctuation # Po [2] PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK..PRESENTATION FORM FOR VERTICAL QUESTION MARK
FE50..FE52 ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP
FE54..FE57 ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK
FF01 ; Terminal_Punctuation # Po FULLWIDTH EXCLAMATION MARK
Expand Down Expand Up @@ -231,7 +234,7 @@ FF64 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA
1BC9F ; Terminal_Punctuation # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA87..1DA8A ; Terminal_Punctuation # Po [4] SIGNWRITING COMMA..SIGNWRITING COLON

# Total code points: 285
# Total code points: 291

# ================================================

Expand Down Expand Up @@ -1543,6 +1546,7 @@ FF65 ; Other_ID_Continue # Po HALFWIDTH KATAKANA MIDDLE DOT
2024 ; Sentence_Terminal # Po ONE DOT LEADER
203C..203D ; Sentence_Terminal # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; Sentence_Terminal # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2CF9..2CFB ; Sentence_Terminal # Po [3] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN INDIRECT QUESTION MARK
2E2E ; Sentence_Terminal # Po REVERSED QUESTION MARK
2E3C ; Sentence_Terminal # Po STENOGRAPHIC FULL STOP
2E53..2E54 ; Sentence_Terminal # Po [2] MEDIEVAL EXCLAMATION MARK..MEDIEVAL QUESTION MARK
Expand All @@ -1558,6 +1562,8 @@ A9C8..A9C9 ; Sentence_Terminal # Po [2] JAVANESE PADA LINGSA..JAVANESE PADA
AA5D..AA5F ; Sentence_Terminal # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA
AAF0..AAF1 ; Sentence_Terminal # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
ABEB ; Sentence_Terminal # Po MEETEI MAYEK CHEIKHEI
FE12 ; Sentence_Terminal # Po PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP
FE15..FE16 ; Sentence_Terminal # Po [2] PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK..PRESENTATION FORM FOR VERTICAL QUESTION MARK
FE52 ; Sentence_Terminal # Po SMALL FULL STOP
FE56..FE57 ; Sentence_Terminal # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK
FF01 ; Sentence_Terminal # Po FULLWIDTH EXCLAMATION MARK
Expand Down Expand Up @@ -1598,7 +1604,7 @@ FF61 ; Sentence_Terminal # Po HALFWIDTH IDEOGRAPHIC FULL STOP
1BC9F ; Sentence_Terminal # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA88 ; Sentence_Terminal # Po SIGNWRITING FULL STOP

# Total code points: 164
# Total code points: 170

# ================================================

Expand Down
24 changes: 17 additions & 7 deletions unicodetools/data/ucd/dev/auxiliary/SentenceBreakProperty.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SentenceBreakProperty-16.0.0.txt
# Date: 2024-05-06, 12:18:03 GMT
# Date: 2024-05-08, 04:00:01 GMT
# © 2024 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
Expand Down Expand Up @@ -2710,6 +2710,7 @@ FF0E ; ATerm # Po FULLWIDTH FULL STOP
1C7E..1C7F ; STerm # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
203C..203D ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2CF9..2CFB ; STerm # Po [3] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN INDIRECT QUESTION MARK
2E2E ; STerm # Po REVERSED QUESTION MARK
2E3C ; STerm # Po STENOGRAPHIC FULL STOP
2E53..2E54 ; STerm # Po [2] MEDIEVAL EXCLAMATION MARK..MEDIEVAL QUESTION MARK
Expand All @@ -2725,6 +2726,8 @@ A9C8..A9C9 ; STerm # Po [2] JAVANESE PADA LINGSA..JAVANESE PADA LUNGSI
AA5D..AA5F ; STerm # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA
AAF0..AAF1 ; STerm # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
ABEB ; STerm # Po MEETEI MAYEK CHEIKHEI
FE12 ; STerm # Po PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP
FE15..FE16 ; STerm # Po [2] PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK..PRESENTATION FORM FOR VERTICAL QUESTION MARK
FE56..FE57 ; STerm # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK
FF01 ; STerm # Po FULLWIDTH EXCLAMATION MARK
FF1F ; STerm # Po FULLWIDTH QUESTION MARK
Expand Down Expand Up @@ -2763,7 +2766,7 @@ FF61 ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP
1BC9F ; STerm # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
1DA88 ; STerm # Po SIGNWRITING FULL STOP

# Total code points: 160
# Total code points: 166

# ================================================

Expand Down Expand Up @@ -2957,26 +2960,33 @@ FF63 ; Close # Pe HALFWIDTH RIGHT CORNER BRACKET

002C ; SContinue # Po COMMA
002D ; SContinue # Pd HYPHEN-MINUS
003A ; SContinue # Po COLON
003A..003B ; SContinue # Po [2] COLON..SEMICOLON
037E ; SContinue # Po GREEK QUESTION MARK
055D ; SContinue # Po ARMENIAN COMMA
060C..060D ; SContinue # Po [2] ARABIC COMMA..ARABIC DATE SEPARATOR
061B ; SContinue # Po ARABIC SEMICOLON
07F8 ; SContinue # Po NKO COMMA
1364 ; SContinue # Po ETHIOPIC SEMICOLON
1802 ; SContinue # Po MONGOLIAN COMMA
1808 ; SContinue # Po MONGOLIAN MANCHU COMMA
2013..2014 ; SContinue # Pd [2] EN DASH..EM DASH
204F ; SContinue # Po REVERSED SEMICOLON
2E35 ; SContinue # Po TURNED SEMICOLON
3001 ; SContinue # Po IDEOGRAPHIC COMMA
A6F6 ; SContinue # Po BAMUM SEMICOLON
FE10..FE11 ; SContinue # Po [2] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA
FE13 ; SContinue # Po PRESENTATION FORM FOR VERTICAL COLON
FE13..FE14 ; SContinue # Po [2] PRESENTATION FORM FOR VERTICAL COLON..PRESENTATION FORM FOR VERTICAL SEMICOLON
FE31..FE32 ; SContinue # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH
FE50..FE51 ; SContinue # Po [2] SMALL COMMA..SMALL IDEOGRAPHIC COMMA
FE55 ; SContinue # Po SMALL COLON
FE54..FE55 ; SContinue # Po [2] SMALL SEMICOLON..SMALL COLON
FE58 ; SContinue # Pd SMALL EM DASH
FE63 ; SContinue # Pd SMALL HYPHEN-MINUS
FF0C ; SContinue # Po FULLWIDTH COMMA
FF0D ; SContinue # Pd FULLWIDTH HYPHEN-MINUS
FF1A ; SContinue # Po FULLWIDTH COLON
FF1A..FF1B ; SContinue # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON
FF64 ; SContinue # Po HALFWIDTH IDEOGRAPHIC COMMA
1DA89 ; SContinue # Po SIGNWRITING SEMICOLON

# Total code points: 26
# Total code points: 37

# EOF
Original file line number Diff line number Diff line change
Expand Up @@ -1464,6 +1464,7 @@ public int getMaxWidth(boolean isShort) {
unicodeMap.putAll(
getProperty("STerm")
.getSet(UCD_Names.YES)
.addAll(new UnicodeSet("[\\u2CF9\\u2CFA\\u2CFB\\uFE12\\uFE15\\uFE16]"))
.removeAll(unicodeMap.keySet("ATerm")),
"STerm");
unicodeMap.putAll(
Expand All @@ -1476,21 +1477,32 @@ public int getMaxWidth(boolean isShort) {
"Close");
unicodeMap.putAll(
new UnicodeSet(
"[\\u002C\\u3001\\uFE10\\uFE11\\uFF0C"
+ "\\uFE50\\uFF64\\uFE51\\uFE51\\u055D\\u060C\\u060D\\u07F8\\u1802\\u1808"
+ // new
// from
// L2/08-029
"\\u003A\\uFE13\\uFF1A"
+ "\\uFE55"
+ // new from L2/08-029
// "\\u003B\\uFE14\\uFF1B" +
"\\u2014\\uFE31\\u002D\\uFF0D"
+ "\\u2013\\uFE32\\uFE58\\uFE63"
+ // new
// from
// L2/08-029
"]"),
"[\\u002C\\u3001\\uFE10\\uFE11\\uFF0C"
+ "\\uFE50\\uFF64\\uFE51\\uFE51\\u055D\\u060C\\u060D\\u07F8\\u1802\\u1808"
+ // new
// from
// L2/08-029
"\\u003A\\uFE13\\uFF1A"
+ "\\uFE55"
+ // new from L2/08-029
// "\\u003B\\uFE14\\uFF1B" +
"\\u2014\\uFE31\\u002D\\uFF0D"
+ "\\u2013\\uFE32\\uFE58\\uFE63"
+ // new
// from
// L2/08-029
"]")
.add(0x003B)
.add(0x037E)
.add(0x061B)
.add(0x1364)
.add(0x204F)
.add(0x2E35)
.add(0xA6F6)
.add(0xFE14)
.add(0xFE54)
.add(0xFF1B)
.add(0x1DA89),
"SContinue");
// unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none
// of the above touch it.
Expand Down

0 comments on commit 74ff6e9

Please sign in to comment.