unicode-org · srl295 · Jan 29, 2024 · jahorton · Jan 30, 2024 · jahorton
diff --git a/docs/ldml/tr35-keyboards.md b/docs/ldml/tr35-keyboards.md
@@ -2908,6 +2908,196 @@ This attribute specifies the expected resultant text in a document after process
     <check result="abc\u{0022}s\u{0022}•éÈ"/>
 </test>
 ```
+## Appendix: Pseudocode for Marker-safe Normalization
+
+The following is an informative section with some example pseudocode for marker-safe normalization into NFD.
+
+1. Markers are not text. This pseudocode assumes an implementation which is able to distinguish markers from plain text.
+
+2. Some optimization notes are given, but the emphasis here is on a clear and correct implementation.
+
+3. A distinction would need to be made between (A) markers in text, such as a keystroke emitting `\m{someMarker}` or a marker in input context, and (B) markers in transform match strings, such as `<transform from="\m{marker}">` or `<transform from="\m{.}">`.  An implementation could pass a parameter distinguishing A from B through the following code path; for clarity, that distinction is not shown here.
+
+Example use:
+
+- `nfdMarkers('e\u{0300}' + Marker('marker1') + '\u{0320}a\u{0300}' + Marker('marker2') + '\u{0320}')`
+- Returns: `'e' + Marker('marker1') + '\u{0320}\u{0300}a' + Marker('marker2') + '\u{0320}\u{0300}'`
+
+```js
+/**
+ * Normalize a string-with-markers into NFD.
+ *
+ * Two quick exits are tried first (no markers at all, or text already in NFD).
+ * Otherwise the input is cut into normalization-safe segments, and each
+ * segment is normalized on its own by nfdMarkersSegment().
+ * @param input string with markers as out-of-band
+ * @returns normalized output as a string with markers
+ */
+export function nfdMarkers(input /* string with markers */) {
+    // to accumulate output text
+    let output = "";
+    // remove all markers
+    const noMarkers = removeMarkers(input);
+    // NFD the no-marker text.
+    const noMarkersNfd = normalizeNFD(noMarkers);
+
+    // Two quick checks.
+    if (noMarkers === input) {
+        // Heuristic: if there were no markers we can safely return noMarkersNfd
+        return noMarkersNfd;
+    } else if (noMarkersNfd === noMarkers) {
+        // Heuristic: if normalization made no change, return the input string: already NFD
+        return input;
+    }
+
+    let segmentStart = 0;
+    let segmentEnd = 0;
+    // 'i' = iterate by codepoints, starting at the beginning of the input
+    let i = 0;
+
+    // look for normalization-safe segments
+    do {
+        const haveMarker = markerAt(i); // true if there is a marker here
+        if (i === input.end) {
+            segmentEnd = i; // end of input - end of segment
+        } else if (hasNFDBoundaryBefore(i) && !haveMarker) {
+            segmentEnd = i; // end of segment
+            i++; // move past the NFD char
+        } else if (haveMarker) {
+            i += /* marker */; // skip past marker
+        } else {
+            // non boundary, just move index forward
+            i++;
+        }
+
+        // a non-empty segment has been closed off: normalize it and emit it
+        if (segmentStart != segmentEnd) {
+            const segment    = input.substring(segmentStart, segmentEnd);
+            const segmentNfd = nfdMarkersSegment(segment);
+            output += segmentNfd;
+            segmentStart = segmentEnd;
+        }
+    } while (segmentEnd != input.end);
+
+    return output;
+}
+
+/**
+ * Normalize a single segment into NFD, keeping its markers.
+ * @param input segment text
+ * @returns normalized output segment
+ */
+function nfdMarkersSegment(input /* string with markers */) {
+    const stripped    = removeMarkers(input);
+    const markers     = parseMarkers(input);
+    const strippedNfd = normalizeNFD(stripped);
+
+    // no markers in this segment: the normalized text is the whole result
+    if (markers.empty()) {
+        return strippedNfd;
+    }
+
+    // normalization changed nothing: hand the segment back untouched
+    if (strippedNfd == stripped) {
+        return input;
+    }
+
+    // otherwise re-insert the markers into the normalized text
+    return addBackMarkers(strippedNfd, markers);
+}
+
+// represents a removed marker, to be re-added
+// Entries are produced by parseMarkers() and consumed by addBackMarkers().
+// NOTE(review): the '= false' initializers below are pseudocode for default
+// values; a real TypeScript interface cannot carry initializers.
+interface MarkerEntry {
+    // The character this marker is attached to. parseMarkers() takes it from the
+    // first codepoint of the NFD form of the text following the marker.
+    // It is not set on end-of-text entries.
+    glue: string;      // 'glue' character
+    marker: Marker;  // which marker
+    // Set by addBackMarkers() when it re-adds a marker that was not at the end
+    // of the list, so that the marker is not emitted a second time.
+    processed: boolean = false; // track whether this was processed or not
+    // True when parseMarkers() ran out of codepoints with this marker still
+    // pending, i.e. no character followed it.
+    eot: boolean = false; // track whether end of text or not
+}
+
+/**
+ * parse the string for markers
+ * @param s string with markers
+ * @returns one entry per marker, in order of appearance. Each entry records the
+ *          'glue' character following the marker, or is flagged as end-of-text
+ *          if no character follows it.
+ */
+function parseMarkers(s: string): MarkerEntry[] {
+    // Note: parseMarkers() and removeMarkers() (not shown) could be the same function.
+    // queue of markers found, still waiting for their glue character
+    let lastMarkers: Marker[] = [];
+    // return value
+    let outputMap: MarkerEntry[] = [];
+
+    // iterate string by codepoints
+    for (let i = s.begin; i < s.end;) {
+        if (markerAt(i)) {
+            // found a marker - add it to the list
+            lastMarkers.add(markerAt(i));
+            i += /* marker length */;
+        } else {
+            // From 'i' find the glue char for all markers found so far
+            // (in order)
+            // use first char of decomposed string
+            const glue = normalizeNFD(i).codePointAt(0);
+            for (const m of lastMarkers) {
+                outputMap.add({marker: m, glue: glue});
+            }
+            // these markers now have their glue; start a fresh queue
+            lastMarkers.clear();
+
+            i++; // move to next codepoint
+        }
+    }
+
+    // ran out of codepoints.
+    // any remaining markers are EOT
+    for (const m of lastMarkers) {
+        outputMap.add({marker: m, eot: true});
+    }
+
+    return outputMap;
+}
+
+/**
+ * re-add markers to input. Mutates map.
+ *
+ * Works from the end of the text towards the beginning, prepending each
+ * codepoint and then any markers whose glue character is that codepoint.
+ * @param input normalized string to re-add markers
+ * @param map list of glued markers. will be mutated
+ * @returns the normalized string with its markers re-inserted
+ */
+function addBackMarkers(input : string, map: MarkerEntry[]) : stringWithMarkers {
+    // quick check
+    if (map.empty()) return input;
+
+    // output string
+    let out = '';
+
+    // first, add all of the EOT entries
+    while (!map.empty() && map[map.length - 1].eot) {
+        const m = map.popEnd(); // remove the last entry
+        out = m.marker + out; // prepend the marker
+    }
+
+    // now, process input codepoints from end to beginning.
+    // input.end is one past the last codepoint, so start just before it.
+    for (let i = input.end - 1; i >= input.begin; i--) {
+        // 0. prepend this codepoint
+        out = i + out;
+
+        // 1. any markers at the end of list which match get added
+        while (!map.empty() && map[map.length - 1].glue == i) {
+            // remove last entry, so we don't have to process it any more.
+            const m = map.popEnd();
+            if (!m.processed) {
+                out = m.marker + out; // prepend marker
+            } // else we already processed it
+        }
+
+        // 2. look for any out of order markers, not right at the end.
+        // These stay in the list (flagged as processed) and are discarded by
+        // step 1 once they reach the end of the list.
+        for (let m2 = map.length - 2; m2 >= 0; m2--) {
+            const m = map[m2]; // peek at this earlier entry without removing it
+            if (m.glue == i && !m.processed) {
+                m.processed = true; // so we don't process it again
+                out = m.marker + out; // prepend marker
+            }
+        }
+    }
+    return out;
+}
+
+// Normalization functions (to be provided)
+// Only signatures are given here; no bodies are part of this pseudocode.
+function isNFD(s): boolean; // return true if in NFD
+function getCanonicalCombiningClass(c): int; // return the CCC according to UAX#44
+function normalizeNFD(s): string; // return NFD of a string
+
+// other functions (not shown)
+// NOTE: markerAt(), which the pseudocode above calls to test for a marker at a
+// position, is likewise not shown.
+function removeMarkers(s): string; // strip out all markers from a string
+
+/**
+ * Reports whether a character does NOT interact with prior chars in
+ * normalization. See hasBoundaryBefore() in ICU.
+ */
+function hasNFDBoundaryBefore(s: string): boolean {
+    // See UAX #15
+    if (!isNFD(s)) {
+        return false;
+    }
+    // the combining class is only checked once the NFD check has passed
+    return getCanonicalCombiningClass(s) == 0;
+}
+```
 
 * * *