Skip to content

Commit

Permalink
Unicode: fix the extended grapheme cluster algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
handsomematt committed Jul 14, 2023
1 parent 52156f8 commit 3670fee
Show file tree
Hide file tree
Showing 7 changed files with 6,573 additions and 6,986 deletions.
12,660 changes: 6,385 additions & 6,275 deletions valve/qtbase/src/corelib/text/qunicodetables.cpp

Large diffs are not rendered by default.

5 changes: 1 addition & 4 deletions valve/qtbase/src/corelib/text/qunicodetables_p.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,7 @@ enum GraphemeBreakClass {
GraphemeBreak_T,
GraphemeBreak_LV,
GraphemeBreak_LVT,
Graphemebreak_E_Base,
Graphemebreak_E_Modifier,
Graphemebreak_Glue_After_Zwj,
Graphemebreak_E_Base_GAZ,
GraphemeBreak_Extended_Pictographic,

NumGraphemeBreakClasses
};
Expand Down
181 changes: 123 additions & 58 deletions valve/qtbase/src/corelib/text/qunicodetools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@

#include "qharfbuzz_p.h"

#include <limits.h>

#define FLAG(x) (1 << (x))

QT_BEGIN_NAMESPACE
Expand All @@ -55,53 +57,98 @@ namespace QUnicodeTools {
// -----------------------------------------------------------------------------------------------------
//
// The text boundaries determination algorithm.
// See http://www.unicode.org/reports/tr29/tr29-31.html
// See https://www.unicode.org/reports/tr29/tr29-37.html
//
// -----------------------------------------------------------------------------------------------------

namespace GB {

/*
* Most grapheme break rules can be implemented table driven, but rules GB10, GB12 and GB13 need a bit
* of special treatment.
*/
enum State : uchar {
Break,
Inside,
GB10,
GB10_2,
GB10_3,
GB13, // also covers GB12
// This table is indexed by the grapheme break classes of two
// (adjacent) code points.
// The class of the first code point selects an entry.
// If the entry's bit at position second_cp_class is set
// (in other words: if entry & (1u << second_cp_class) is non-zero)
// then there is NO grapheme break between the two code points.

// Storage type of one breakTable entry: a bitmask holding one bit per
// grapheme break class of the second code point.
using GBTableEntryType = quint16;

// Check that we have enough bits in the table (in case
// NumGraphemeBreakClasses grows too much).
static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
"Internal error: increase the size in bits of GBTableEntryType");

// GB9, GB9a: mask of the second-code-point classes Extend, SpacingMark and ZWJ.
// Every breakTable row except CR, LF and Control includes this mask.
static const GBTableEntryType Extend_SpacingMark_ZWJ =
FLAG(QUnicodeTables::GraphemeBreak_Extend)
| FLAG(QUnicodeTables::GraphemeBreak_SpacingMark)
| FLAG(QUnicodeTables::GraphemeBreak_ZWJ);

// Empty mask: no bit is set, so the table reports a break before every class.
static const GBTableEntryType HardBreak = 0u;

// One entry per grapheme break class of the FIRST code point, in the order
// given by the trailing comment on each row. A set bit FLAG(c) in an entry
// means: no grapheme break before a directly following code point of class c.
// Only pairwise rules are encoded here; anything that depends on more than
// the two adjacent classes has to be handled by the caller.
static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses] = {
Extend_SpacingMark_ZWJ, // Any
FLAG(QUnicodeTables::GraphemeBreak_LF), // CR
HardBreak, // LF
HardBreak, // Control
Extend_SpacingMark_ZWJ, // Extend
Extend_SpacingMark_ZWJ, // ZWJ
Extend_SpacingMark_ZWJ, // RegionalIndicator
(Extend_SpacingMark_ZWJ
| FLAG(QUnicodeTables::GraphemeBreak_Any)
| FLAG(QUnicodeTables::GraphemeBreak_Prepend)
| FLAG(QUnicodeTables::GraphemeBreak_L)
| FLAG(QUnicodeTables::GraphemeBreak_V)
| FLAG(QUnicodeTables::GraphemeBreak_T)
| FLAG(QUnicodeTables::GraphemeBreak_LV)
| FLAG(QUnicodeTables::GraphemeBreak_LVT)
| FLAG(QUnicodeTables::GraphemeBreak_RegionalIndicator)
| FLAG(QUnicodeTables::GraphemeBreak_Extended_Pictographic)
), // Prepend
Extend_SpacingMark_ZWJ, // SpacingMark
(Extend_SpacingMark_ZWJ
| FLAG(QUnicodeTables::GraphemeBreak_L)
| FLAG(QUnicodeTables::GraphemeBreak_V)
| FLAG(QUnicodeTables::GraphemeBreak_LV)
| FLAG(QUnicodeTables::GraphemeBreak_LVT)
), // L
(Extend_SpacingMark_ZWJ
| FLAG(QUnicodeTables::GraphemeBreak_V)
| FLAG(QUnicodeTables::GraphemeBreak_T)
), // V
(Extend_SpacingMark_ZWJ
| FLAG(QUnicodeTables::GraphemeBreak_T)
), // T
(Extend_SpacingMark_ZWJ
| FLAG(QUnicodeTables::GraphemeBreak_V)
| FLAG(QUnicodeTables::GraphemeBreak_T)
), // LV
(Extend_SpacingMark_ZWJ
| FLAG(QUnicodeTables::GraphemeBreak_T)
), // LVT
Extend_SpacingMark_ZWJ // Extended_Pictographic
};

static const State breakTable[QUnicodeTables::NumGraphemeBreakClasses][QUnicodeTables::NumGraphemeBreakClasses] = {
// Any CR LF Control Extend ZWJ RI Prepend S-Mark L V T LV LVT E_B E_M GAZ EBG
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
{ Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Control
{ Break , Break , Break , Break , GB10_2, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , GB10_3, Break , Break }, // Extend
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Inside, Inside }, // ZWJ
{ Break , Break , Break , Break , Inside, Inside, GB13 , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
{ Inside, Break , Break , Break , Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside }, // Prepend
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SpacingMark
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Inside, Inside, Break , Inside, Inside, Break , Break , Break , Break }, // L
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // V
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // T
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // LV
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // LVT
{ Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // E_B
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_M
{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
{ Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // EBG
// Pairwise table lookup: returns true when breakTable permits a grapheme
// boundary between a code point of class `first` and a directly following
// code point of class `second`. Rules that need more context than the two
// adjacent classes are not decided here.
static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
                                      QUnicodeTables::GraphemeBreakClass second)
{
    // Bit set for `second` in the row of `first` means "keep together".
    const auto noBreakMask = breakTable[first];
    const bool joined = (noBreakMask & FLAG(second)) != 0;
    return !joined;
}

// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
// so we need to store some local state while scanning the string.
enum class State : uchar {
Normal, // no multi-code-point rule is currently being tracked
GB11_ExtPicExt, // saw an Extend after an Extended_Pictographic
GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
GB12_13_RI, // saw a RegionalIndicator that may be the first of a pair
};

} // namespace GB

static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
{
QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
GB::State state = GB::Break; // only required to track some of the rules
GB::State state = GB::State::Normal;
for (quint32 i = 0; i != len; ++i) {
quint32 pos = i;
uint ucs4 = string[i];
Expand All @@ -116,37 +163,55 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;

switch (GB::breakTable[lcls][cls]) {
case GB::Break:
attributes[pos].graphemeBoundary = true;
state = GB::Break;
break;
case GB::Inside:
state = GB::Break;
bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);

switch (state) {
case GB::State::Normal:
if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
if (cls == QUnicodeTables::GraphemeBreak_Extend) {
state = GB::State::GB11_ExtPicExt;
Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
state = GB::State::GB11_ExtPicExtZWJ;
Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
}
} else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
state = GB::State::GB12_13_RI;
}

break;
case GB::GB10:
state = GB::GB10;
case GB::State::GB11_ExtPicExt:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
if (cls == QUnicodeTables::GraphemeBreak_Extend) {
// keep going in the current state
Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
state = GB::State::GB11_ExtPicExtZWJ;
Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
}

break;
case GB::GB10_2:
if (state == GB::GB10 || state == GB::GB10_2)
state = GB::GB10_2;
else
state = GB::Break;

case GB::State::GB11_ExtPicExtZWJ:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic)
shouldBreak = false;

state = GB::State::Normal;
break;
case GB::GB10_3:
if (state != GB::GB10 && state != GB::GB10_2)
attributes[pos].graphemeBoundary = true;
state = GB::Break;

case GB::State::GB12_13_RI:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator)
shouldBreak = false;

state = GB::State::Normal;
break;
case GB::GB13:
if (state != GB::GB13) {
state = GB::GB13;
} else {
attributes[pos].graphemeBoundary = true;
state = GB::Break;
}
}

if (shouldBreak)
attributes[pos].graphemeBoundary = true;

lcls = cls;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -618,11 +618,11 @@
÷ 0061 ÷ 0600 × 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) ÷ [999.0] ARABIC NUMBER SIGN (Prepend) × [9.2] LATIN SMALL LETTER B (Other) ÷ [0.3]
÷ 1F476 × 1F3FF ÷ 1F476 ÷ # ÷ [0.2] BABY (ExtPict) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) ÷ [0.3]
÷ 0061 × 1F3FF ÷ 1F476 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) ÷ [0.3]
# ÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]
# ÷ 1F476 × 1F3FF × 0308 × 200D × 1F476 × 1F3FF ÷ # ÷ [0.2] BABY (ExtPict) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) × [9.0] COMBINING DIAERESIS (Extend_ExtCccZwj) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] BABY (ExtPict) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [0.3]
# ÷ 1F6D1 × 200D × 1F6D1 ÷ # ÷ [0.2] OCTAGONAL SIGN (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]
÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]
÷ 1F476 × 1F3FF × 0308 × 200D × 1F476 × 1F3FF ÷ # ÷ [0.2] BABY (ExtPict) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) × [9.0] COMBINING DIAERESIS (Extend_ExtCccZwj) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] BABY (ExtPict) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [0.3]
÷ 1F6D1 × 200D × 1F6D1 ÷ # ÷ [0.2] OCTAGONAL SIGN (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]
÷ 0061 × 200D ÷ 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) ÷ [999.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]
# ÷ 2701 × 200D × 2701 ÷ # ÷ [0.2] UPPER BLADE SCISSORS (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] UPPER BLADE SCISSORS (Other) ÷ [0.3]
÷ 2701 × 200D × 2701 ÷ # ÷ [0.2] UPPER BLADE SCISSORS (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] UPPER BLADE SCISSORS (Other) ÷ [0.3]
÷ 0061 × 200D ÷ 2701 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) ÷ [999.0] UPPER BLADE SCISSORS (Other) ÷ [0.3]
#
# Lines: 602
Expand Down
Loading

0 comments on commit 3670fee

Please sign in to comment.