Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat(web): support reversion of accepted suggestions after edits #12884

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ import Suggestion = LexicalModelTypes.Suggestion;
import Transform = LexicalModelTypes.Transform;
import USVString = LexicalModelTypes.USVString;

/**
* Sentinel suggestion ID indicating that the owning token's 'replacement'
* represents part of an actually-applied suggestion, rather than a
* still-pending one.
*
* Genuine suggestions always carry non-negative IDs, so this negative
* sentinel can never collide with a real suggestion's ID.
*/
export const APPLIED_SUGGESTION_COMPONENT = -2;

function textToCharTransforms(text: string, transformId?: number) {
let perCharTransforms: Transform[] = [];

Expand Down Expand Up @@ -42,9 +48,15 @@ export class TrackedContextToken {
isWhitespace?: boolean;

transformDistributions: Distribution<Transform>[] = [];
replacements: TrackedContextSuggestion[];
replacements: TrackedContextSuggestion[] = [];
activeReplacementId: number = -1;

/**
* Constructs a new tracked token, optionally cloning an existing instance.
*
* NOTE(review): `Object.assign` performs a *shallow* copy — array-valued
* fields such as `replacements` and `transformDistributions` remain shared
* with `instance` until a caller reassigns them wholesale. Confirm that all
* callers reassign (rather than mutate in place) after cloning.
* When called with no argument, `Object.assign(this, undefined)` is a no-op
* and the field initializers' defaults stand.
*/
constructor();
constructor(instance: TrackedContextToken);
constructor(instance?: TrackedContextToken) {
Object.assign(this, instance);
}

get currentText(): string {
if(this.replacementText === undefined || this.replacementText === null) {
return this.raw;
Expand Down Expand Up @@ -85,7 +97,9 @@ export class TrackedContextToken {
});

this.raw = tokenText;
this.transformDistributions = backspacedTokenContext;
this.transformDistributions = backspacedTokenContext;
this.activeReplacementId = null;
this.replacements = []
}

update(transformDistribution: Distribution<Transform>, tokenText?: USVString) {
Expand All @@ -98,6 +112,8 @@ export class TrackedContextToken {

// Replace old token's raw-text with new token's raw-text.
this.raw = tokenText;
this.activeReplacementId = null;
this.replacements = []
}
}

Expand Down Expand Up @@ -314,14 +330,20 @@ interface ContextMatchResult {
* Should always be non-null if the token before the caret did not previously exist.
*/
preservationTransform?: Transform;

headTokensRemoved: number;
tailTokensAdded: number;
}

export class ContextTracker extends CircularArray<TrackedContextState> {
static attemptMatchContext(
tokenizedContext: Token[],
matchState: TrackedContextState,
transformSequenceDistribution?: Distribution<Transform[]>
transformSequenceDistribution?: Distribution<Transform[]>,
isApplyingSuggestion?: boolean
): ContextMatchResult {
isApplyingSuggestion ??= false;

// Map the previous tokenized state to an edit-distance friendly version.
let matchContext: USVString[] = matchState.toRawTokenization();

Expand All @@ -337,14 +359,18 @@ export class ContextTracker extends CircularArray<TrackedContextState> {

let editPath = mapping.editPath();

// When the context has but two tokens, the path algorithm tends to invert
// When the context has but three tokens, the path algorithm tends to invert
// 'insert' and 'substitute' from our preferred ordering for them.
// Logically, either order makes sense... but logic for other cases is
// far simpler if we have 'substitute' before 'insert'.
if(editPath.length == 2 && editPath[0] == 'insert' && editPath[1] == 'substitute') {
if(editPath.length == 3 && editPath[0] == 'insert' && editPath[1] == 'insert' && editPath[2] == 'substitute') {
editPath[0] = 'substitute';
editPath[2] = 'insert';
} else if(editPath.length == 2 && editPath[0] == 'insert' && editPath[1] == 'substitute') {
// This path may be reachable for languages without space-based wordbreaking.
editPath[0] = 'substitute';
editPath[1] = 'insert';
}
}

const firstMatch = editPath.indexOf('match');
let lastMatch = editPath.lastIndexOf('match');
Expand All @@ -370,11 +396,14 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
// If we have a perfect match with a pre-existing context, no mutations have
// happened; just re-use the old context state.
if(firstMatch == 0 && lastMatch == editPath.length - 1) {
return { state: matchState, baseState: matchState };
return { state: matchState, baseState: matchState, headTokensRemoved: 0, tailTokensAdded: 0 };
}

// If mutations HAVE happened, we have work to do.
let state = matchState;
if(isApplyingSuggestion) {
state = new TrackedContextState(state);
}

let priorEdit: typeof editPath[0];
let poppedTokenCount = 0;
Expand Down Expand Up @@ -412,6 +441,7 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
// does not land as part of the final token in the resulting context. This
// component should be preserved by any suggestions that get applied.
let preservationTransform: Transform;
let pushedTokenCount = 0;

// Now to update the end of the context window.
for(let i = lastMatch+1; i < editPath.length; i++) {
Expand Down Expand Up @@ -459,7 +489,10 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
}

const token = state.tokens[i - poppedTokenCount];
const matchToken = matchState.tokens[i];
const matchToken = isApplyingSuggestion ? new TrackedContextToken(matchState.tokens[i]) : matchState.tokens[i];
if(isApplyingSuggestion) {
state.tokens[i - poppedTokenCount] = matchToken;
}

// TODO: I'm beginning to believe that searchSpace should (eventually) be tracked
// on the tokens, rather than on the overall 'state'.
Expand Down Expand Up @@ -540,6 +573,7 @@ export class ContextTracker extends CircularArray<TrackedContextState> {

// Auto-replaces the search space to correspond with the new token.
state.pushTail(pushedToken);
pushedTokenCount++;
break;
case 'match':
// The default (Unicode) wordbreaker returns an empty token after whitespace blocks.
Expand All @@ -550,21 +584,47 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
continue;
}
// else 'fallthrough' / return null
default:
// No 'delete' should exist on the trailing edge of context when the
// context window slides. While it can happen due to keystrokes with
// `deleteLeft`, we keep a cache of recent contexts - an older one will
// likely match sufficiently.
// - may see 'delete' followed by 'substitute' in such cases.
case 'delete':
// While we do keep a cache of recent contexts, logic constraints for handling
// multitaps makes it tricky to reliably use in all situations.
// It's best to handle `delete` cases directly for this reason.
for(let j = i + 1; j < editPath.length; j++) {
// If something _other_ than delete follows a 'delete' on the edit path,
// we probably have a context mismatch.
//
// It's possible to construct cases where this isn't true, but it's likely not
// worth trying to handle such rare cases.
if(editPath[j] != 'delete') {
return null;
}
}

// If ALL that remains are deletes, we're good to go.
//
// No 'transform' edits should exist within this section, either.
// This may not be the token at the index, but since all that remains are deletes,
// we'll have deleted the correct total number from the end once all iterations
// are done.
if(state == matchState) {
state = new TrackedContextState(state);
}

state.tokens.pop();
break;
default:
// No 'transform' edits should exist within this section.
return null;
}

priorEdit = editPath[i];
}

return { state, baseState: matchState, preservationTransform };
return {
state,
baseState: matchState,
preservationTransform,
headTokensRemoved: poppedTokenCount,
tailTokensAdded: pushedTokenCount
};
}

private static modelContextState(
Expand Down Expand Up @@ -623,7 +683,8 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
analyzeState(
model: LexicalModel,
context: Context,
transformDistribution?: Distribution<Transform>
transformDistribution?: Distribution<Transform>,
isApplyingSuggestion?: boolean
): ContextMatchResult {
if(!model.traverseFromRoot) {
// Assumption: LexicalModel provides a valid traverseFromRoot function. (Is technically optional)
Expand Down Expand Up @@ -676,7 +737,7 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
continue;
}

let result = ContextTracker.attemptMatchContext(tokenizedContext.left, this.item(i), tokenizedDistribution);
let result = ContextTracker.attemptMatchContext(tokenizedContext.left, this.item(i), tokenizedDistribution, isApplyingSuggestion);

if(result?.state) {
// Keep it reasonably current! And it's probably fine to have it more than once
Expand Down Expand Up @@ -704,7 +765,7 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
let state = ContextTracker.modelContextState(tokenizedContext.left, model);
state.taggedContext = context;
this.enqueue(state);
return { state, baseState: null };
return { state, baseState: null, headTokensRemoved: 0, tailTokensAdded: 0 };
}

clearCache() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import * as models from '@keymanapp/models-templates';
import * as correction from './correction/index.js'
import APPLIED_SUGGESTION_COMPONENT = correction.APPLIED_SUGGESTION_COMPONENT;

import TransformUtils from './transformUtils.js';
import { correctAndEnumerate, dedupeSuggestions, finalizeSuggestions, predictionAutoSelect, processSimilarity, toAnnotatedSuggestion, tupleDisplayOrderSort } from './predict-helpers.js';
Expand All @@ -14,10 +15,12 @@ import Reversion = LexicalModelTypes.Reversion;
import Suggestion = LexicalModelTypes.Suggestion;
import Transform = LexicalModelTypes.Transform;
import USVString = LexicalModelTypes.USVString;
import { tokenizeTransform } from './correction/transform-tokenization.js';
import { deepCopy } from '@keymanapp/web-utils';

export class ModelCompositor {
private lexicalModel: LexicalModel;
private contextTracker?: correction.ContextTracker;
readonly lexicalModel: LexicalModel;
readonly contextTracker?: correction.ContextTracker;

static readonly MAX_SUGGESTIONS = 12;
readonly punctuation: LexicalModelPunctuation;
Expand Down Expand Up @@ -129,6 +132,41 @@ export class ModelCompositor {
const timer = this.activeTimer = new correction.ExecutionTimer(this.testMode ? Number.MAX_VALUE : SEARCH_TIMEOUT, this.testMode ? Number.MAX_VALUE : SEARCH_TIMEOUT * 1.5);

const { postContextState, rawPredictions } = await correctAndEnumerate(this.contextTracker, this.lexicalModel, timer, transformDistribution, context);

// TODO: maybe we should find a way to duplicate the checks below (re revertible contexts)
// and use them inside `correctAndEnumerate` - why burn literal milliseconds of compute
// time when we know we'll throw those results away?

// Check: did we just reach the tail of a prior token via BKSP?
if(postContextState.tail.replacements?.length > 0) {
// Bypass standard suggestion stuff; re-offer the old suggestions + display a revert option.
const currentToken = postContextState.tail;

// Original suggestion set: acquired. But... they're based on a different context...
const appliedSuggestion = currentToken?.replacement;

// We added a partial, synthetic 'replacement' useful for tracking how much to revert.
// It should have a set, distinct ID corresponding to the check below. This partial
// version was never a real suggestion; even if it were, the original is still within our list!
const suggestions = currentToken.replacements.filter((entry) => entry != appliedSuggestion).map((entry) => deepCopy(entry));

// Verify: were we successfully able to set up a 'rewind' / revertible scenario?
if(appliedSuggestion?.suggestion?.id == APPLIED_SUGGESTION_COMPONENT) {
// We were? It's GO TIME!

// First up: we can safely use the raw insert string as the needed length to erase.
const deleteLeft = appliedSuggestion.suggestion.transform.insert.kmwLength();
suggestions.forEach((entry) => entry.suggestion.transform.deleteLeft = deleteLeft);

const keepSuggestion = suggestions.find((entry) => entry.suggestion.tag == 'keep');
// Convert the original 'keep' suggestion into a REVERT suggestion; this will restore the
// original context.
keepSuggestion.suggestion.tag = 'revert';

// Bypass the rest of the correction-search; just use these!
return suggestions.map((entry) => entry.suggestion);
}
}

if(this.activeTimer == timer) {
this.activeTimer = null;
Expand Down Expand Up @@ -267,9 +305,57 @@ export class ModelCompositor {
contextState = this.contextTracker.analyzeState(this.lexicalModel, context).state;
}

// Suggestion IDs at this level are unique.
contextState.tail.activeReplacementId = suggestion.id;
let acceptedContext = models.applyTransform(suggestion.transform, context);
this.contextTracker.analyzeState(this.lexicalModel, acceptedContext);
contextState.taggedContext = context;

// Hmm... need to mark the 'accepted' aspect of this in some way...
// Maybe on the state itself? No, won't work - not after multiple BKSP.
//
// We know which state it should match; all there is to do is actually do the bookkeeping.
const matchResults = this.contextTracker.analyzeState(
this.lexicalModel,
context,
[{
sample: suggestion.transform,
p: 1.0
}],
true
);

// If context-tracking handles the applied suggestion properly...
if(matchResults?.baseState) {
// Get the index of the first token altered by the suggestion being applied.
let substitutionTokenIndex = (contextState.tokens.length - 1) - matchResults.headTokensRemoved;

const tokenizer = determineModelTokenizer(this.lexicalModel)
let tokenizedApplication = tokenizeTransform(tokenizer, context, suggestion.transform);

// We build our suggestions to do whole-word replacement. Fortunately, that means we already have
// the full suggestions!
const suggestions = contextState.tail.replacements;
if(suggestions && (substitutionTokenIndex + tokenizedApplication.length == matchResults.state.tokens.length)) {

for(let j = 1; j <= tokenizedApplication.length; j++) {
const replacementPortion: correction.TrackedContextSuggestion = {
suggestion: {
...suggestion,
id: APPLIED_SUGGESTION_COMPONENT, // Actual suggestions always present non-negative IDs; this can uniquely mark
// this as a component of an applied suggestion.
transform: tokenizedApplication.slice(0, j).reduce((accum, current) => models.buildMergedTransform(accum, current), { insert: '', deleteLeft: 0}),
},
tokenWidth: j
}

const token = matchResults.state.tokens[substitutionTokenIndex + j - 1];
// Attach our fragmented version of the suggestion - the part useful for rewinding it -
// as its own suggestion with a distinct, unique ID indicative of this property.
token.replacements = [ replacementPortion ].concat(suggestions);
token.activeReplacementId = APPLIED_SUGGESTION_COMPONENT; // perhaps give unique ID + overwrite the original suggestion ID.
}
}
// else: we're not confident we can map the replacement details safely to do reversion later
}
}

return reversion;
Expand Down
Loading
Loading