Skip to content

Commit

Permalink
Merge pull request #9504 from keymanapp/feat/core/9121-regex-map-epic…
Browse files Browse the repository at this point in the history
…-ldml

feat(core): implement mapped set mapping 🙀
  • Loading branch information
srl295 authored Aug 25, 2023
2 parents 4105667 + 07ded45 commit 462802a
Show file tree
Hide file tree
Showing 7 changed files with 277 additions and 41 deletions.
63 changes: 61 additions & 2 deletions core/src/kmx/kmx_plus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "ldml/keyboardprocessor_ldml.h"

#include <assert.h>
#include "kmx_plus.h"

namespace km {
namespace kbp {
Expand Down Expand Up @@ -386,11 +387,35 @@ COMP_KMXPLUS_ELEM::getElementList(KMX_DWORD elementNumber, KMX_DWORD &length) co

/**
 * Converts `element` with COMP_KMXPLUS_STRS::str_from_char() and returns the
 * resulting std::u16string.
 * Must only be called on char-typed elements: the asserts below require the
 * type bits of `flags` to equal LDML_ELEM_FLAGS_TYPE_CHAR.
 */
std::u16string
COMP_KMXPLUS_ELEM_ELEMENT::get_element_string() const {
// NOTE(review): this assert and the next one test the same condition (type()
// returns `flags & LDML_ELEM_FLAGS_TYPE`); presumably only one is meant to
// remain — confirm.
assert((flags & LDML_ELEM_FLAGS_TYPE) == LDML_ELEM_FLAGS_TYPE_CHAR); // should only be called on char
assert(type() == LDML_ELEM_FLAGS_TYPE_CHAR); // should only be called on char
return COMP_KMXPLUS_STRS::str_from_char(element);
}

// Note: shared with subclass COMP_KMXPLUS_BKSP
/**
 * Build a list of UTF-32 strings from `length` consecutive elements, starting
 * at this one (this[0] .. this[length-1]).
 *
 * @param length number of elements to read, including this one
 * @param strs   string table consulted for elements whose type is
 *               LDML_ELEM_FLAGS_TYPE_STR
 * @return one string per element, in element order
 */
std::deque<std::u32string>
COMP_KMXPLUS_ELEM_ELEMENT::loadAsStringList(KMX_DWORD length, const COMP_KMXPLUS_STRS &strs) const {
  std::deque<std::u32string> result;
  const COMP_KMXPLUS_ELEM_ELEMENT *cursor = this;
  for (KMX_DWORD remaining = length; remaining > 0; --remaining, ++cursor) {
    if (cursor->type() != LDML_ELEM_FLAGS_TYPE_STR) {
      // not a string reference: the element value itself is the single char
      result.emplace_back(std::u32string(1, (km_kbp_usv)cursor->element));
      continue;
    }
    // string reference: fetch it from the table, then widen UTF-16 -> UTF-32
    const auto utf16 = strs.get(cursor->element);
    result.emplace_back(km::kbp::kmx::u16string_to_u32string(utf16));
  }
  return result;
}

/**
 * @return the type bits of this element, i.e. `flags` masked with
 *         LDML_ELEM_FLAGS_TYPE
 */
KMX_DWORD
COMP_KMXPLUS_ELEM_ELEMENT::type() const {
  const KMX_DWORD typeBits = flags & LDML_ELEM_FLAGS_TYPE;
  return typeBits;
}

// Note: shared with subclass COMP_KMXPLUS_BKSP
bool
COMP_KMXPLUS_TRAN::valid(KMX_DWORD _kmn_unused(length)) const {
if (header.size < sizeof(*this) + (sizeof(COMP_KMXPLUS_TRAN_GROUP) * groupCount) +
Expand Down Expand Up @@ -503,6 +528,18 @@ COMP_KMXPLUS_TRAN_Helper::setTran(const COMP_KMXPLUS_TRAN *newTran) {
is_valid = false;
assert(is_valid);
}
for(KMX_DWORD t = 0; is_valid && t < group.count; t++) {
const auto &transform = transforms[group.index + t];
if (transform.from == 0) {
DebugLog("COMP_KMXPLUS_TRAN_Helper: transform [%d].[%d] has empty 'from' string", i, t);
is_valid = false;
assert(is_valid);
} else if ((transform.mapFrom == 0) != (transform.mapTo == 0)) {
DebugLog("COMP_KMXPLUS_TRAN_Helper: transform [%d].[%d] should have neither or both mapFrom=%d/mapTo=%d", i, t, transform.mapFrom, transform.mapTo);
is_valid = false;
assert(is_valid);
}
}
} else if (group.type == LDML_TRAN_GROUP_TYPE_REORDER) {
DebugLog(" .. type=reorder");
if ((group.index >= tran->reorderCount) || (group.index + group.count > tran->reorderCount)) {
Expand All @@ -511,6 +548,15 @@ COMP_KMXPLUS_TRAN_Helper::setTran(const COMP_KMXPLUS_TRAN *newTran) {
is_valid = false;
assert(is_valid);
}
for(KMX_DWORD t = 0; is_valid && t < group.count; t++) {
const auto &reorder = reorders[group.index + t];
if (reorder.elements == 0) {
DebugLog("COMP_KMXPLUS_TRAN_Helper: reorder [%d].[%d] has elements=0", i, t);
// TODO-LDML: is this an error?
// is_valid = false;
// assert(is_valid);
}
}
} else {
DebugLog(" .. type=illegal 0x%X", group.type);
is_valid = false;
Expand Down Expand Up @@ -1248,6 +1294,19 @@ COMP_KMXPLUS_VARS::valid(KMX_DWORD _kmn_unused(length)) const {
return true;
}

/**
 * Linear scan of varEntries for the first entry whose `id` equals strId.
 *
 * @param strId string id to search for; 0 is never looked up
 * @return pointer to the matching entry, or nullptr if strId is 0 or no
 *         entry matches
 */
const COMP_KMXPLUS_VARS_ITEM *COMP_KMXPLUS_VARS::findByStringId(KMX_DWORD strId) const {
  const COMP_KMXPLUS_VARS_ITEM *found = nullptr;
  if (strId != 0) {
    for (KMX_DWORD i = 0; found == nullptr && i < varCount; i++) {
      const COMP_KMXPLUS_VARS_ITEM &candidate = varEntries[i];
      if (candidate.id == strId) {
        found = &candidate;
      }
    }
  }
  return found;
}



} // namespace kmx
} // namespace kbp
Expand Down
26 changes: 20 additions & 6 deletions core/src/kmx/kmx_plus.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <kmx_file.h>
#include <ldml/keyboardprocessor_ldml.h>
#include <list>
#include <deque>

namespace km {
namespace kbp {
Expand Down Expand Up @@ -47,6 +48,7 @@ struct COMP_KMXPLUS_TRAN;
struct COMP_KMXPLUS_TRAN_GROUP;
struct COMP_KMXPLUS_TRAN_TRANSFORM;
struct COMP_KMXPLUS_TRAN_REORDER;
struct COMP_KMXPLUS_STRS;

struct COMP_KMXPLUS_HEADER {
KMXPLUS_IDENT ident; // 0000 Section name
Expand Down Expand Up @@ -103,6 +105,16 @@ struct COMP_KMXPLUS_ELEM_ELEMENT {
* @return std::u16string
*/
std::u16string get_element_string() const;

/**
* @brief load this[0]…this[length-1] as a string list
* @param length number of elements, including this one
* @param strs string table used to look up string-typed elements
* @return the elements, in order, as a deque of UTF-32 strings
*/
std::deque<std::u32string> loadAsStringList(KMX_DWORD length, const km::kbp::kmx::COMP_KMXPLUS_STRS &strs) const;

/** @return element type */
KMX_DWORD type() const;
};

struct COMP_KMXPLUS_ELEM_ENTRY {
Expand Down Expand Up @@ -260,8 +272,8 @@ struct COMP_KMXPLUS_TRAN_GROUP {
/**
 * A single transform entry. `from` and `to` are string ids: the loader
 * resolves both through strs->get().
 * NOTE(review): mapFrom/mapTo are declared twice below (once as KMXPLUS_ELEM,
 * once as KMXPLUS_STR). The loader hands them to findByStringId(), so the
 * KMXPLUS_STR pair looks like the intended one — confirm and drop the other.
 */
struct COMP_KMXPLUS_TRAN_TRANSFORM {
KMXPLUS_STR from; // string id of the 'from' text; validation rejects 0
KMXPLUS_STR to;   // string id of the 'to' text
KMXPLUS_ELEM mapFrom;
KMXPLUS_ELEM mapTo;
KMXPLUS_STR mapFrom; // variable name
KMXPLUS_STR mapTo; // variable name
};

struct COMP_KMXPLUS_TRAN_REORDER {
Expand Down Expand Up @@ -341,21 +353,23 @@ struct COMP_KMXPLUS_BKSP : public COMP_KMXPLUS_TRAN {

/**
 * One entry of COMP_KMXPLUS_VARS::varEntries.
 * NOTE(review): id/value/elem are each declared twice below (first as
 * KMX_DWORD_unaligned, then as KMXPLUS_STR / KMXPLUS_ELEM); presumably only
 * the second set is meant to remain — confirm.
 */
struct COMP_KMXPLUS_VARS_ITEM {
KMX_DWORD_unaligned type; // compared against LDML_VARS_ENTRY_TYPE_SET by the transform loader
KMX_DWORD_unaligned id;
KMX_DWORD_unaligned value;
KMX_DWORD_unaligned elem;
KMXPLUS_STR id;    // key matched by COMP_KMXPLUS_VARS::findByStringId()
KMXPLUS_STR value; // NOTE(review): not read anywhere in the visible code; presumably the variable's string value — confirm
KMXPLUS_ELEM elem; // element list number, passed to COMP_KMXPLUS_ELEM::getElementList()
};

/**
 * The vars section: a fixed header followed by varCount entries.
 * NOTE(review): `markers` is declared twice below (KMX_DWORD_unaligned and
 * KMXPLUS_LIST); presumably only the KMXPLUS_LIST declaration is meant to
 * remain — confirm.
 */
struct COMP_KMXPLUS_VARS {
static const KMXPLUS_IDENT IDENT = LDML_SECTIONID_VARS;
COMP_KMXPLUS_HEADER header;
KMX_DWORD_unaligned markers;
KMXPLUS_LIST markers;
KMX_DWORD_unaligned varCount;        // number of items in varEntries
COMP_KMXPLUS_VARS_ITEM varEntries[]; // variable-length tail, indexed 0..varCount-1
/**
* @brief True if section is valid.
*/
bool valid(KMX_DWORD length) const;

/**
* @brief Find the entry whose id equals strId.
* @param strId string id to search for
* @return the matching entry, or nullptr if strId is 0 or nothing matches
*/
const COMP_KMXPLUS_VARS_ITEM *findByStringId(KMX_DWORD strId) const;
};

static_assert(sizeof(struct COMP_KMXPLUS_VARS) % 0x4 == 0, "Structs prior to variable part should align to 32-bit boundary");
Expand Down
153 changes: 128 additions & 25 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,18 +392,74 @@ reorder_group::apply(std::u32string &str) const {
return applied;
}

/**
 * Copy constructor. Copies the from/to strings and the map ids and lists.
 * The pattern pointer starts out null and, if `other` holds a pattern, is
 * replaced by a clone of it rather than sharing the same object.
 * NOTE(review): two constructor headers appear back to back below; the first
 * (which omits the map members) looks like a superseded version — confirm
 * which one should remain.
 */
transform_entry::transform_entry(const transform_entry &other) :
fFrom(other.fFrom), fTo(other.fTo), fFromPattern(nullptr) {
transform_entry::transform_entry(const transform_entry &other)
: fFrom(other.fFrom), fTo(other.fTo), fFromPattern(nullptr), fMapFromStrId(other.fMapFromStrId),
fMapToStrId(other.fMapToStrId), fMapFromList(other.fMapFromList), fMapToList(other.fMapToList) {
if (other.fFromPattern) {
// clone pattern
fFromPattern.reset(other.fFromPattern->clone());
}
}

/**
 * Construct a transform from `from` and `to` with no set mapping (the map ids
 * and lists are value-initialized), then call init().
 * `from` is asserted to be non-empty.
 * NOTE(review): two constructor headers appear back to back below; the first
 * (with the shorter initializer list) looks like a superseded version —
 * confirm which one should remain.
 */
transform_entry::transform_entry(const std::u32string &from, const std::u32string &to) : fFrom(from), fTo(to) {
transform_entry::transform_entry(const std::u32string &from, const std::u32string &to)
: fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(), fMapToStrId(), fMapFromList(), fMapToList() {
assert(!fFrom.empty()); // TODO-LDML: should not happen?

init();
}

// TODO-LDML: How do we return errors from here?
/**
 * Construct a transform entry that may carry a set mapping.
 *
 * @param from    source pattern text; asserted non-empty
 * @param to      replacement text
 * @param mapFrom string id of the source set variable, or 0 for no mapping
 * @param mapTo   string id of the target set variable, or 0 for no mapping
 * @param kplus   keyboard data; its strs, vars and elem are asserted non-null
 *
 * mapFrom and mapTo must both be zero or both be non-zero. Every failure
 * condition here is checked with assert() only.
 */
transform_entry::transform_entry(
    const std::u32string &from,
    const std::u32string &to,
    KMX_DWORD mapFrom,
    KMX_DWORD mapTo,
    const kmx::kmx_plus &kplus)
    : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(mapFrom), fMapToStrId(mapTo) {
  assert(!fFrom.empty()); // TODO-LDML: should not happen?
  assert((fMapFromStrId == 0) == (fMapToStrId == 0)); // we have both or we have neither.
  assert(kplus.strs != nullptr);
  assert(kplus.vars != nullptr);
  assert(kplus.elem != nullptr);
  init();

  if (fMapFromStrId == 0) {
    // no set mapping requested: nothing more to set up
    return;
  }

  // Note: if we need the variable name it is available as follows,
  // but isn't needed for normal processing. Could be useful for debug messages.
  // auto mapFrom = kplus.strs->get(fMapFromStrId);
  // auto mapTo = kplus.strs->get(fMapToStrId);

  // resolve both variables by the string id of their name
  auto *sourceVar = kplus.vars->findByStringId(fMapFromStrId);
  auto *targetVar = kplus.vars->findByStringId(fMapToStrId);
  assert(sourceVar != nullptr);
  assert(targetVar != nullptr);

  // both variables have to be sets
  assert(sourceVar->type == LDML_VARS_ENTRY_TYPE_SET);
  assert(targetVar->type == LDML_VARS_ENTRY_TYPE_SET);

  // fetch the element list behind each set
  KMX_DWORD sourceCount;
  KMX_DWORD targetCount;
  auto *sourceElems = kplus.elem->getElementList(sourceVar->elem, sourceCount);
  auto *targetElems = kplus.elem->getElementList(targetVar->elem, targetCount);
  assert(sourceCount == targetCount);
  assert(sourceElems != nullptr);
  assert(targetElems != nullptr);

  // expand the element lists into the string deques used at match time
  fMapFromList = sourceElems->loadAsStringList(sourceCount, *(kplus.strs));
  fMapToList   = targetElems->loadAsStringList(targetCount, *(kplus.strs));

  // did we get the expected items?
  assert(fMapFromList.size() == sourceCount);
  assert(fMapToList.size() == targetCount);
}

void
transform_entry::init() {
if (!fFrom.empty()) {
// TODO-LDML: if we have mapFrom, may need to do other processing.
const std::u16string patstr = km::kbp::kmx::u32string_to_u16string(fFrom);
UErrorCode status = U_ZERO_ERROR;
/* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length());
Expand All @@ -421,7 +477,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
// TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher.
UErrorCode status = U_ZERO_ERROR;
const std::u16string matchstr = km::kbp::kmx::u32string_to_u16string(input);
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
// TODO-LDML: create a new Matcher every time. These could be cached and reset.
std::unique_ptr<icu::RegexMatcher> matcher(fFromPattern->matcher(matchustr, status));
assert(U_SUCCESS(status));
Expand All @@ -438,25 +494,65 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
// extract..
const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd);
// preflight to UTF-32 to get length
UErrorCode substrStatus = U_ZERO_ERROR;
UErrorCode substrStatus = U_ZERO_ERROR; // throwaway status
// we need the UTF-32 matchLen for our return.
auto matchLen = substr.toUTF32(nullptr, 0, substrStatus);

// should have matched something.
assert(matchLen > 0);
if (matchLen == 0) {
return 0;
}
// Now, we have a matchLen.

// now, do the replace.
// Convert the fTo into u16 TODO-LDML (we could cache this?)
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(fTo);
icu::UnicodeString rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
// This replace will apply $1, $2 etc. TODO-LDML it will NOT handle mapFrom or mapTo.

/** this is the 'to' or other replacement string.*/
icu::UnicodeString rustr;
if (fMapFromStrId == 0) {
// Normal case: not a map.
// This replace will apply $1, $2 etc.
// Convert the fTo into u16 TODO-LDML (we could cache this?)
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(fTo);
rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
} else {
// Set map case: mapping from/to

// we actually need the group(1) string here.
// this is only the content in parenthesis ()
icu::UnicodeString group1 = matcher->group(1, status);
assert(U_SUCCESS(status)); // TODO-LDML: could be a malformed from pattern
// now, how long is group1 in UTF-32, hmm?
UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status
auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus);
char32_t *s = new char32_t[group1Len + 1];
assert(s != nullptr); // TODO-LDML: OOM
// convert
substr.toUTF32((UChar32 *)s, group1Len + 1, status);
assert(U_SUCCESS(status));
std::u32string match32(s, group1Len); // taken from just group1
// clean up buffer
delete [] s;

// Now we're ready to do the actual mapping.

// 1., we need to find the index in the source set.
auto matchIndex = findIndexFrom(match32);
assert(matchIndex != -1L); // TODO-LDML: not matching shouldn't happen, the regex wouldn't have matched.
// we already asserted on load that the from and to sets have the same cardinality.

// 2. get the target string, convert to utf-16
// we use the same matchIndex that was just found
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(fMapToList.at(matchIndex));

// 3. update the UnicodeString for replacement
rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
// and we return to the regular code flow.
}
// here we replace the match output.
icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status);
assert(U_SUCCESS(status));
assert(U_SUCCESS(status)); // TODO-LDML: could fail here due to bad input (syntax err)

// entireOutput includes all of 'input', but modified. Need to substring it.
icu::UnicodeString outu = entireOutput.tempSubString(matchStart);

// Special case if there's no output
// Special case if there's no output, save some allocs
if (outu.length() == 0) {
output.clear();
} else {
Expand All @@ -479,6 +575,20 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
return matchLen;
}

/**
 * Locate `match` within this entry's mapFrom list.
 * @param match string to look for
 * @return zero-based index of the first equal item, or -1 if there is none
 */
int32_t
transform_entry::findIndexFrom(const std::u32string &match) const {
  const int32_t position = findIndex(match, fMapFromList);
  return position;
}

/**
 * Find the first item in `list` equal to `match`.
 * @param match string to look for
 * @param list  candidates, searched front to back (taken by value)
 * @return zero-based index of the first equal item, or -1 if there is none
 */
int32_t
transform_entry::findIndex(const std::u32string &match, const std::deque<std::u32string> list) {
  int32_t position = 0;
  for (const auto &candidate : list) {
    if (candidate == match) {
      return position;
    }
    ++position;
  }
  return -1; // not found
}

/**
 * Wrap a transform group: `type` becomes any_group_type::transform,
 * `transform` is initialized from g, and `reorder` is value-initialized.
 */
any_group::any_group(const transform_group &g)
    : type(any_group_type::transform),
      transform(g),
      reorder() {
}

Expand Down Expand Up @@ -686,16 +796,9 @@ transforms::load(
const kmx::COMP_KMXPLUS_TRAN_TRANSFORM *element = tranHelper.getTransform(group->index + itemNumber);
const std::u32string fromStr = kmx::u16string_to_u32string(kplus.strs->get(element->from));
const std::u32string toStr = kmx::u16string_to_u32string(kplus.strs->get(element->to));
std::u16string mapFrom, mapTo;

if (element->mapFrom && element->mapTo) {
// strings: variable name of from/to
// TODO-LDML: not implemented
mapFrom = kplus.strs->get(element->mapFrom);
mapTo = kplus.strs->get(element->mapTo);
}

newGroup.emplace_back(fromStr, toStr /* ,mapFrom, mapTo */); // creating a transform_entry
KMX_DWORD mapFrom = element->mapFrom; // copy, because of alignment
KMX_DWORD mapTo = element->mapTo; // copy, because of alignment
newGroup.emplace_back(fromStr, toStr, mapFrom, mapTo, kplus); // creating a transform_entry
}
transforms->addGroup(newGroup);
} else if (group->type == LDML_TRAN_GROUP_TYPE_REORDER) {
Expand Down
Loading

0 comments on commit 462802a

Please sign in to comment.