Skip to content

Commit

Permalink
Merge pull request #10517 from keymanapp/feat/core/10516-marker-norm-…
Browse files Browse the repository at this point in the history
…epic-ldml

feat(core): ldml marker normalization fix 🙀
  • Loading branch information
srl295 authored Jan 26, 2024
2 parents 35966c3 + 580e372 commit a643c48
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 9 deletions.
35 changes: 28 additions & 7 deletions core/src/ldml/ldml_markers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,15 +381,15 @@ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) {
}

/** add the list to the map */
static void add_markers_to_map(marker_map &markers, char32_t marker_ch, const marker_list &list) {
inline void add_markers_to_map(marker_map &markers, char32_t marker_ch, const marker_list &list) {
for (auto i = list.begin(); i < list.end(); i++) {
// marker_ch is duplicate, but keeps the structure more shallow.
markers.emplace_back(marker_ch, *i);
}
}

/**
* Add any markers, if needed
* Add any markers, if needed. Inlined because we need to run it twice.
* @param markers marker map or nullptr
* @param last the 'last' parameter past the prior parsing
* @param end end of the input string
Expand All @@ -399,24 +399,45 @@ add_pending_markers(
marker_map *markers,
marker_list &last_markers,
const std::u32string::const_iterator &last,
const std::u32string::const_iterator &end) {
const std::u32string::const_iterator &end,
const icu::Normalizer2 *nfd) {
// quick check to see if there's no work to do.
if(markers == nullptr || last_markers.empty()) {
return;
}
/** which character this marker is 'glued' to. */
char32_t marker_ch;
if (last == end) {
// at end of text, so use a special value to indicate 'EOT'.
marker_ch = MARKER_BEFORE_EOT;
} else {
marker_ch = *last;
icu::UnicodeString decomposition;
auto ch = *last; // non-normalized character

// if the character is composed, we need to use the first decomposed char
// as the 'glue'.
if(!nfd->getDecomposition(ch, decomposition)) {
// char does not have a decomposition - so it may be used for the glue
marker_ch = ch;
} else {
// 'glue' is the first codepoint of the decomposition.
marker_ch = decomposition.char32At(0);
}
}

// now, update the map with these markers (in order) on this character.
add_markers_to_map(*markers, marker_ch, last_markers);
last_markers.clear(); // mark as already recorded
// clear the list
last_markers.clear();
}

std::u32string
remove_markers(const std::u32string &str, marker_map *markers, marker_encoding encoding) {
std::u32string out;
marker_list last_markers;
UErrorCode status = U_ZERO_ERROR;
const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(status);
UASSERT_SUCCESS(status);

auto last = str.begin(); // points to the part of the string after the last matched marker
for (auto i = str.begin(); i != str.end();) {
Expand All @@ -425,7 +446,7 @@ remove_markers(const std::u32string &str, marker_map *markers, marker_encoding e
// add any markers found before this entry, but only if there is intervening
// text. This prevents the sentinel or the '\u' from becoming the attachment char.
if (i != last) {
add_pending_markers(markers, last_markers, last, str.end());
add_pending_markers(markers, last_markers, last, str.end(), nfd);
out.append(last, i); // append any non-marker text since the end of the last marker
last = i; // advance over text we've already appended
}
Expand All @@ -443,7 +464,7 @@ remove_markers(const std::u32string &str, marker_map *markers, marker_encoding e
// add any remaining pending markers.
// if last == str.end() then this wil be MARKER_BEFORE_EOT
// otherwise it will be the glue character
add_pending_markers(markers, last_markers, last, str.end());
add_pending_markers(markers, last_markers, last, str.end(), nfd);
// get the suffix between the last marker and the end (could be nothing)
out.append(last, str.end());
return out;
Expand Down
38 changes: 37 additions & 1 deletion core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml
Original file line number Diff line number Diff line change
Expand Up @@ -305,5 +305,41 @@
<check result="YES9d"/>
</test>
</tests>

<!-- separate tests for NFC-->
<tests name="regex-tests-9e">
<test name="regex-test-9e-0">
<keystroke key="9" />
<keystroke key="e" />
<keystroke key="e" />
<keystroke key="u-0300" />
<keystroke key="stampy" />
<keystroke key="e" />
<keystroke key="u-0300" /> <!-- out of order -->
<keystroke key="u-0320" />
<keystroke key="lgtm" />
<check result="YES9e" />
</test>
<test name="regex-test-9e-0">
<keystroke key="9" />
<keystroke key="e" />
<keystroke key="e" />
<keystroke key="u-0300" />
<keystroke key="stampy" />
<keystroke key="e" />
<keystroke key="u-0320" />
<keystroke key="u-0300" /> <!-- in order -->
<keystroke key="lgtm" />
<check result="YES9e" />
</test>
</tests>
<tests name="regex-tests-9f">
<test name="regex-test-9f-0">
<keystroke key="9" />
<keystroke key="f" />
<keystroke key="e" />
<keystroke key="stampy" />
<keystroke key="u-0344" />
<check result="YES9f" />
</test>
</tests>
</keyboardTest3>
6 changes: 5 additions & 1 deletion core/tests/unit/ldml/keyboards/k_008_transform_norm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar
<key id="u-0320" output="\u{0320}" /> <!-- ◌̠ -->
<key id="u-0300" output="\u{0300}" /> <!-- ◌̀ -->
<key id="u-00e8" output="\u{00e8}" /> <!-- è -->
<key id="u-0344" output="\u{0344}" /> <!-- discouraged greek dialytika tonos -->
<key id="nfd" output="e\u{0320}\u{0300}" />
<key id="nfc" output="\u{00E8}\u{0320}" />
<key id="not-nfd" output="e\u{0300}\u{0320}" />
Expand All @@ -29,7 +30,7 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar
<row keys="space" />
</layer>
<layer modifiers="shift" id="shift">
<row keys="u-0300 u-00e8 nfd nfc not-nfd stampy lgtm" />
<row keys="u-0300 u-00e8 nfd nfc not-nfd stampy lgtm u-0344" />
</layer>
</layers>

Expand Down Expand Up @@ -84,6 +85,9 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar
<transform from="9c" to="9ce\u{0300}\m{stampy}\u{0320}\m{lgtm}" /> <!-- marker, denormalized -->
<transform from="9d" to="9de\u{0320}\m{stampy}\u{0300}\m{lgtm}" /> <!-- marker, NFD -->

<transform from="9e\u{00E8}\m{stampy}\u{00E8}\u{0320}\m{lgtm}" to="YES9e" /> <!-- NFC in rules-->

<transform from="9fe\m{stampy}\u{0344}" to="YES9f" />
</transformGroup>
<transformGroup>
<transform from="9ce\m{stampy}[\u{0320}][\u{0300}]\m{lgtm}" to="YES9c" />
Expand Down
73 changes: 73 additions & 0 deletions core/tests/unit/ldml/test_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -950,11 +950,84 @@ test_normalize() {
zassert_string_equal(dst_rem, expect_rem);
std::u32string dst_nfd = src;
assert(normalize_nfd_markers(dst_nfd));
if (dst_nfd != expect_nfd) {
std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
}
zassert_string_equal(dst_nfd, expect_nfd);
}
{
marker_map map;
std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl;
// KA \m O -> KA \m E AA
const std::u32string src = U"\u0995\uFFFF\u0008\u0001\u09CB";
const std::u32string expect_rem = U"\u0995\u09CB";
const std::u32string expect_nfd = U"\u0995\uFFFF\u0008\u0001\u09C7\u09BE";
auto dst_rem = remove_markers(src, &map);
marker_map expm({{0x09C7, 0x1L}});
zassert_string_equal(dst_rem, expect_rem);
std::u32string dst_nfd = src;
assert(normalize_nfd_markers(dst_nfd));
if (dst_nfd != expect_nfd) {
std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
}
zassert_string_equal(dst_nfd, expect_nfd);
assert_marker_map_equal(map, expm);
}
{
marker_map map;
std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl;
const std::u32string src = U"\u0995\u09BE\uFFFF\u0008\u0001\u09C7";
const std::u32string expect_rem = U"\u0995\u09BE\u09C7";
const std::u32string expect_nfd = src; // does not get reordered
auto dst_rem = remove_markers(src, &map);
marker_map expm({{0x09C7, 0x1L}});
zassert_string_equal(dst_rem, expect_rem);
std::u32string dst_nfd = src;
assert(normalize_nfd_markers(dst_nfd));
if (dst_nfd != expect_nfd) {
std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
}
zassert_string_equal(dst_nfd, expect_nfd);
assert_marker_map_equal(map, expm);
}
{
marker_map map;
std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-NFC " << std::endl;
const std::u32string src = U"\u0995\u09BE\uFFFF\u0008\u0001\u09C7";
const std::u32string expect_rem = U"\u0995\u09BE\u09C7";
const std::u32string expect_nfd = src; // does not get reordered
auto dst_rem = remove_markers(src, &map);
marker_map expm({{0x09C7, 0x1L}});
zassert_string_equal(dst_rem, expect_rem);
std::u32string dst_nfd = src;
assert(normalize_nfd_markers(dst_nfd));
if (dst_nfd != expect_nfd) {
std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
}
zassert_string_equal(dst_nfd, expect_nfd);
assert_marker_map_equal(map, expm);
}
{
marker_map map;
std::cout << __FILE__ << ":" << __LINE__ << " - marker-before-greek " << std::endl;
const std::u32string src = U"\u03B5\uFFFF\u0008\u0001\u0344";
const std::u32string expect_rem = U"\u03B5\u0344";
const std::u32string expect_nfd = U"\u03B5\uFFFF\u0008\u0001\u0308\u0301";
auto dst_rem = remove_markers(src, &map);
marker_map expm({{0x0308, 0x1L}});
zassert_string_equal(dst_rem, expect_rem);
std::u32string dst_nfd = src;
assert(normalize_nfd_markers(dst_nfd));
if (dst_nfd != expect_nfd) {
std::cout << "dst: " << Debug_UnicodeString(dst_nfd) << std::endl;
std::cout << "exp: " << Debug_UnicodeString(expect_nfd) << std::endl;
}
zassert_string_equal(dst_nfd, expect_nfd);
assert_marker_map_equal(map, expm);
}

return EXIT_SUCCESS;
Expand Down

0 comments on commit a643c48

Please sign in to comment.