fix(web): better bad-context match handling due to tokenization shifts

Fixes: #12494.
keymanapp · Jan 9, 2025 · 064d753 · 064d753
1 parent d702553
commit 064d753
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 7 deletions.
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tracker.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tracker.ts
@@ -309,9 +309,15 @@ interface ContextMatchResult {
   /**
    * Indicates the portion of the incoming keystroke data, if any, that applies to
    * tokens before the last pre-caret token and thus should not be replaced by predictions
-   * based upon `state`.
+   * based upon `state`.  If the provided context state + the incoming transform do not 
+   * adequately match the current context, the match attempt will fail with a `null` result.
    *
-   * Should always be non-null if the token before the caret did not previously exist.
+   * Should generally be non-null if the token before the caret did not previously exist.
+   * 
+   * The result may be null if it does not match the prior context state or if bookkeeping 
+   * based upon it is problematic - say, if wordbreaking effects shift due to new input, 
+   * causing a mismatch with the prior state's tokenization.  
+   * (Refer to #12494 for an example case.)
    */
   preservationTransform?: Transform;
 }
@@ -425,17 +431,36 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
       }
       const transformDistIndex = i - (lastMatch + 1);
       const tokenDistribution = transformSequenceDistribution.map((entry) => {
-        return {
-          sample: entry.sample[transformDistIndex],
-          p: entry.p
-        };
+        const transform = entry.sample[transformDistIndex];
+        if(transform) {
+          return {
+            sample: entry.sample[transformDistIndex],
+            p: entry.p
+          };
+        } else {
+          return null;
+        }
       });
 
+      const incomingToken = tokenizedContext[i - poppedTokenCount];
+
       // If the tokenized part of the input is a completely empty transform,
       // replace it with null.  This can happen with our default wordbreaker
       // immediately after a whitespace.  We don't want to include this
       // transform as part of the input when doing correction-search.
       let primaryInput = hasDistribution ? tokenDistribution[0]?.sample : null;
+
+      // If the incoming token has text but we have no transform to match it with,
+      // abort the matching attempt.  We can't match this case well yet.
+      // These cases are generally infrequent enough to be low-ROI, not worth the
+      // investment to optimize further.  (Doing so isn't the simplest.)
+      //
+      // This may happen if tokenization for pre-existing text changes due to incoming
+      // input - refer to #12494 for an example case.
+      if(!primaryInput && incomingToken.text != '') {
+        return null;
+      }
+
       if(primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft == 0 && !primaryInput.deleteRight) {
         primaryInput = null;
       }
@@ -451,7 +476,6 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
       }
       const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput);
 
-      const incomingToken = tokenizedContext[i - poppedTokenCount]
       switch(editPath[i]) {
         case 'substitute':
           if(isLastToken) {

diff --git a/...gine/predictive-text/worker-thread/src/tests/mocha/cases/edit-distance/context-tracker.js b/...gine/predictive-text/worker-thread/src/tests/mocha/cases/edit-distance/context-tracker.js
@@ -152,6 +152,29 @@ describe('ContextTracker', function() {
       assert.isNotEmpty(state.tokens[state.tokens.length - 2].transformDistributions);
       assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions);
     });
+
+    it('rejects hard-to-handle case: tail token is split into three rather than two', function() {
+      let baseContext = models.tokenize(defaultBreaker, {
+        left: "text'"
+      });
+      assert.equal(baseContext.left.length, 1);
+      let baseContextMatch = ContextTracker.modelContextState(baseContext.left);
+
+      // Now the actual check.
+      let newContext = models.tokenize(defaultBreaker, {
+        left: "text'\""
+      });
+      // The reason it's a problem - current internal logic isn't prepared to shift 
+      // from 1 to 3 tokens in a single step.
+      assert.equal(newContext.left.length, 3);
+
+      let transform = {
+        insert: '\"',
+        deleteLeft: 0
+      }
+      let problemContextMatch = ContextTracker.attemptMatchContext(newContext.left, baseContextMatch, toWrapperDistribution(transform));
+      assert.isNull(problemContextMatch);
+    });
   });
 
   describe('modelContextState', function() {