diff --git a/docs/ldml/tr35-keyboards.md b/docs/ldml/tr35-keyboards.md index 8dabaedf22c..af088c64346 100644 --- a/docs/ldml/tr35-keyboards.md +++ b/docs/ldml/tr35-keyboards.md @@ -2908,6 +2908,196 @@ This attribute specifies the expected resultant text in a document after process ``` +## Appendix: Pseudocode for Marker-safe Normalization + +The following is an informative section with some example pseudocode for marker-safe normalization into NFD. + +1. Markers are not text. This pseudocode assumes an implementation which is able to distinguish markers from plain text. + +2. Some optimization notes are given, but the emphasis here is on a clear and correct implementation. + +3. A distinction would need to be made between (A) markers in text, such as a keystroke emitting `\m{someMarker}` or a marker in input context, and (B) markers in transform match strings, such as `\m{someMarker}` or `\m{.}` inside a transform's `from` pattern. An implementation could pass a parameter through the following code path distinguishing A and B; for clarity, this distinction is not shown. + +Example use: + +- `nfdMarkers('e\u{0300}' + Marker('marker1') + '\u{0320}a\u{0300}' + Marker('marker2') + '\u{0320}')` +- Returns: `'e' + Marker('marker1') + '\u{0320}\u{0300}a' + Marker('marker2') + '\u{0320}\u{0300}'` + +```js +/** + * Normalize a string-with-markers into NFD. + * @param input string containing out-of-band markers + * @returns normalized output as a string with markers + */ +export function nfdMarkers(input /* string with markers */) { + // to accumulate output text + let output = ""; + // remove all markers + let noMarkers = removeMarkers(input); + // NFD the no-marker text. + let noMarkersNfd = normalizeNFD(noMarkers); + + // Two quick checks. 
+ if (noMarkers === input) { + // Heuristic: if there were no markers we can safely return noMarkersNfd + return noMarkersNfd; + } else if (noMarkersNfd === noMarkers) { + // Heuristic: if normalization made no change, return the input string: already NFD + return input; + } + + let segmentStart = 0; + let segmentEnd = 0; + + // look for normalization-safe segments + let i = input.begin; // 'i' iterates by codepoints + do { + const haveMarker = markerAt(i); // true if there is a marker here + if (i === input.end) { + segmentEnd = i; // end of input - end of segment + } else if(hasNFDBoundaryBefore(i) && !haveMarker) { + segmentEnd = i; // end of segment + i++; // move past the NFD char + } else if(haveMarker) { + i += /* marker length */; // skip past marker + } else { + // non boundary, just move index forward + i++; + } + + if (segmentStart != segmentEnd) { + const segment = input.substring(segmentStart, segmentEnd); + const segmentNfd = nfdMarkersSegment(segment); + output += segmentNfd; + segmentStart = segmentEnd; + } + } while(segmentEnd != input.end); + + return output; +} + +/** Normalize one normalization-safe segment, re-adding any markers it contains. + * @param input segment text + * @returns normalized output segment + */ +function nfdMarkersSegment(input /* string with markers */) { + const noMarkers = removeMarkers(input); + const markerMap = parseMarkers(input); + const noMarkersNfd = normalizeNFD(noMarkers); + if (markerMap.empty()) { + // no markers in this segment. 
+ return noMarkersNfd; + } else if(noMarkersNfd == noMarkers) { + // no change in this segment + return input; + } else { + return addBackMarkers(noMarkersNfd, markerMap); + } +} + +// represents a removed marker, to be re-added +interface MarkerEntry { + glue: string; // 'glue' character + marker: Marker; // which marker + processed: boolean = false; // track whether this was processed or not + eot: boolean = false; // track whether end of text or not +} + +/** parse the string for markers */ +function parseMarkers(s: string): MarkerEntry[] { + // Note: parseMarkers() and removeMarkers() (not shown) could be the same function. + // queue of markers found + let lastMarkers: Marker[] = []; + // return value + let outputMap: MarkerEntry[] = []; + + // iterate string by codepoints + for (let i = s.begin; i < s.end;) { + if (markerAt(i)) { + // found a marker - add it to the list + lastMarkers.add(markerAt(i)); + i += /* marker length */; + } else { + // From 'i' find the glue char for all markers found so far + // (in order) + // use first char of decomposed string + const glue = normalizeNFD(i).codePointAt(0); + for (const m of lastMarkers) { + outputMap.add({marker: m, glue: glue}); + } + lastMarkers.clear(); + + i++; // move to next codepoint + } + } + + // ran out of codepoints. + // any remaining markers are EOT + for (const m of lastMarkers) { + outputMap.add({marker: m, eot: true}); + } + return outputMap; +} + +/** + * re-add markers to input. Mutates map. + * @param input normalized string to re-add markers + * @param map list of glued markers. 
will be mutated + */ +function addBackMarkers(input : string, map: MarkerEntry[]) : stringWithMarkers { + // quick check + if(map.empty()) return input; + + // output string + let out = ''; + + // first, add all of the EOT entries + while (!map.empty() && map[map.length - 1].eot) { + const m = map.popEnd(); // remove the last entry + out = m.marker + out; // prepend the marker + } + + // now, process input codepoints from end to beginning + for (i = input.end - 1; i >= input.begin; i--) { // input.end is one past the last codepoint + // 0. prepend this codepoint + out = i + out; + + // 1. any markers at the end of list which match get added + while (!map.empty() && map[map.length - 1].glue == i) { + // remove last entry, so we don't have to process it any more. + const m = map.popEnd(); + if (m.processed == false) { + out = m.marker + out; // prepend marker + } // else we already processed it + } + + // 2. look for any out of order markers, not right at the end + for (let m2 = map.length - 2; m2 >= 0; m2 --) { + const m = map[m2]; // look at an earlier entry (not the last one) without removing it + if (m.glue == i && !m.processed) { + m.processed = true; // so we don't process it again + out = m.marker + out; // prepend marker + } + } + } + return out; +} + +// Normalization functions (to be provided) +function isNFD(s): boolean; // return true if in NFD +function getCanonicalCombiningClass(c): int; // return the CCC according to UAX#44 +function normalizeNFD(s): string; // return NFD of a string + +// other functions (not shown) +function removeMarkers(s): string; // strip out all markers from a string + +/** indicates whether a character does NOT interact with prior chars in + * normalization. See hasBoundaryBefore() in ICU. */ +function hasNFDBoundaryBefore(s: string): boolean { + // See UAX #15 + return (isNFD(s) && getCanonicalCombiningClass(s) == 0); +} +``` * * *