Skip to content

Commit

Permalink
fix(web): better bad-context match handling due to tokenization shifts
Browse files Browse the repository at this point in the history
Fixes: #12494.
  • Loading branch information
Joshua Horton committed Jan 9, 2025
1 parent d702553 commit 064d753
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -309,9 +309,15 @@ interface ContextMatchResult {
/**
* Indicates the portion of the incoming keystroke data, if any, that applies to
* tokens before the last pre-caret token and thus should not be replaced by predictions
* based upon `state`.
* based upon `state`. If the provided context state + the incoming transform do not
* adequately match the current context, the match attempt will fail with a `null` result.
*
* Should always be non-null if the token before the caret did not previously exist.
* Should generally be non-null if the token before the caret did not previously exist.
*
* The result may be null if it does not match the prior context state or if bookkeeping
* based upon it is problematic - say, if wordbreaking effects shift due to new input,
* causing a mismatch with the prior state's tokenization.
* (Refer to #12494 for an example case.)
*/
preservationTransform?: Transform;
}
Expand Down Expand Up @@ -425,17 +431,36 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
}
const transformDistIndex = i - (lastMatch + 1);
const tokenDistribution = transformSequenceDistribution.map((entry) => {
return {
sample: entry.sample[transformDistIndex],
p: entry.p
};
const transform = entry.sample[transformDistIndex];
if(transform) {
return {
sample: entry.sample[transformDistIndex],
p: entry.p
};
} else {
return null;
}
});

const incomingToken = tokenizedContext[i - poppedTokenCount];

// If the tokenized part of the input is a completely empty transform,
// replace it with null. This can happen with our default wordbreaker
// immediately after a whitespace. We don't want to include this
// transform as part of the input when doing correction-search.
let primaryInput = hasDistribution ? tokenDistribution[0]?.sample : null;

// If the incoming token has text but we have no transform to match it with,
// abort the matching attempt. We can't match this case well yet.
// These cases are generally infrequent enough to be low-ROI, not worth the
// investment to optimize further. (Doing so isn't the simplest.)
//
// This may happen if tokenization for pre-existing text changes due to incoming
// input - refer to #12494 for an example case.
if(!primaryInput && incomingToken.text != '') {
return null;
}

if(primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft == 0 && !primaryInput.deleteRight) {
primaryInput = null;
}
Expand All @@ -451,7 +476,6 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
}
const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput);

const incomingToken = tokenizedContext[i - poppedTokenCount]
switch(editPath[i]) {
case 'substitute':
if(isLastToken) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,29 @@ describe('ContextTracker', function() {
assert.isNotEmpty(state.tokens[state.tokens.length - 2].transformDistributions);
assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions);
});

it('rejects hard-to-handle case: tail token is split into three rather than two', function() {
let baseContext = models.tokenize(defaultBreaker, {
left: "text'"
});
assert.equal(baseContext.left.length, 1);
let baseContextMatch = ContextTracker.modelContextState(baseContext.left);

// Now the actual check.
let newContext = models.tokenize(defaultBreaker, {
left: "text'\""
});
// The reason it's a problem - current internal logic isn't prepared to shift
// from 1 to 3 tokens in a single step.
assert.equal(newContext.left.length, 3);

let transform = {
insert: '\"',
deleteLeft: 0
}
let problemContextMatch = ContextTracker.attemptMatchContext(newContext.left, baseContextMatch, toWrapperDistribution(transform));
assert.isNull(problemContextMatch);
});
});

describe('modelContextState', function() {
Expand Down

0 comments on commit 064d753

Please sign in to comment.