-
Notifications
You must be signed in to change notification settings - Fork 387
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CLDR-17192 kbd: Separate PR for marker pseudocode #3470
Closed
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2908,6 +2908,196 @@ This attribute specifies the expected resultant text in a document after process | |
<check result="abc\u{0022}s\u{0022}•éÈ"/> | ||
</test> | ||
``` | ||
## Appendix: Pseudocode for Marker-safe Normalization | ||
|
||
The following is an informative section with some example pseudocode for marker-safe normalization into NFD. | ||
|
||
1. Markers are not text. This pseudocode assumes an implementation which is able to distinguish markers from plain text. | ||
|
||
2. Some optimization notes are given, but the emphasis here is on a clear and correct implementation. | ||
|
||
3. A distinction would need to be made between (A) markers in text, such as a keystroke emitting `\m{someMarker}` or a marker in input context, and (B) markers in transform match strings, such as `<transform from="\m{marker}">` or `<transform from="\m{.}">`. An implementation could pass a parameter through the following code path to distinguish A from B; for clarity, this distinction is not shown. | ||
|
||
Example use: | ||
|
||
- `nfdMarkers('e\u{0300}' + Marker('marker1') + '\u{0320}a\u{0300}' + Marker('marker2') + '\u{0320}')` | ||
- Returns: `'e' + Marker('marker1') + '\u{0320}\u{0300}a' + Marker('marker2') + '\u{0320}\u{0300}'` | ||
|
||
```js | ||
/** | ||
* Normalize a string-with-markers into NFD. | ||
* @param input string with markers as out-of-band | ||
* @returns normalized output as a string with markers | ||
*/ | ||
export function nfdMarkers(input /* string with markers */) { | ||
// to accumulate output text | ||
let output = ""; | ||
// remove all markers | ||
let noMarkers = removeMarkers(input); | ||
// NFD the no-marker text. | ||
let noMarkersNfd = normalizeNFD(noMarkers); | ||
|
||
// Two quick checks. | ||
if (noMarkers === input) { | ||
// Heuristic: if there were no markers we can safely return noMarkersNfd | ||
return noMarkersNfd; | ||
} else if (noMarkersNfd === noMarkers) { | ||
// Heuristic: if normalization made no change, return the input string: already NFD | ||
return input; | ||
} | ||
|
||
let segmentStart = 0; | ||
let segmentEnd = 0; | ||
|
||
// look for normalization-safe segments | ||
// 'i' = iterate by codepoints | ||
do { | ||
const haveMarker = markerAt(i); // true if there is a marker here | ||
if (i === input.end) { | ||
segmentEnd = i; // end of input - end of segment | ||
} else if(hasNFDBoundaryBefore(i) && !haveMarker) { | ||
segmentEnd = i; // end of segment | ||
i++; // move past the NFD char | ||
} else if(haveMarker) { | ||
i += /* marker */; // skip past marker | ||
} else { | ||
// non boundary, just move index forward | ||
i++; | ||
} | ||
|
||
if (segmentStart != segmentEnd) { | ||
const segment = input.substring(segmentStart, segmentEnd); | ||
const segmentNfd = nfdMarkersSegment(segment); | ||
output.append(segmentNfd); | ||
segmentStart = segmentEnd; | ||
} | ||
} while(segmentEnd != input.end); | ||
|
||
return output; | ||
} | ||
|
||
/** | ||
* @param input segment text | ||
* @returns normalized output segment | ||
*/ | ||
function nfdMarkersSegment(input /* string with markers */) { | ||
const noMarkers = removeMarkers(input); | ||
const markerMap = parseMarkers(input); | ||
const noMarkersNfd = normalizeNFD(noMarkers); | ||
if (markerMap.empty()) { | ||
// no markers in this segment. | ||
return noMarkersNfd; | ||
} else if(noMarkersNfd == noMarkers) { | ||
// no change in this segment | ||
return input; | ||
} else { | ||
return addBackMarkers(noMarkersNfd, markerMap); | ||
} | ||
} | ||
|
||
// represents a removed marker, to be re-added | ||
interface MarkerEntry { | ||
glue: string; // 'glue' character | ||
marker: Marker; // which marker | ||
processed: boolean = false; // track whether this was processed or not | ||
eot: boolean = false; // track whether end of text or not | ||
} | ||
|
||
/** parse the string for markers */ | ||
function parseMarkers(s: string): MarkerEntry[] { | ||
// Note: parseMarkers() and removeMarkers() (not shown) could be the same function. | ||
// queue of markers found | ||
let lastMarkers: Marker[] = []; | ||
// return value | ||
let outputMap: MarkerEntry[] = []; | ||
|
||
|
||
// iterate string by codepoints | ||
for (i = s.begin; i< s.end;) { | ||
if (markerAt(i)) { | ||
// found a marker - add it to the list | ||
lastMarkers.add(markerAt(i)); | ||
i += /* marker length */; | ||
} else { | ||
// From 'i' find the glue char for all markers found so far | ||
// (in order) | ||
// use first char of decomposed string | ||
const glue = normalizeNFD(i).codePointAt(0); | ||
for (m of lastMarkers) { | ||
outputMap.add({marker: m, glue: glue}); | ||
} | ||
lastMarkers.clear(); | ||
|
||
i++; // move to next codepoint | ||
} | ||
} | ||
|
||
// ran out of codepoints. | ||
// any remaining markers are EOT | ||
for (m of lastMarkers) { | ||
outputMap.add({marker: m, eot: true}); | ||
} | ||
return outputMap; | ||
} | ||
|
||
/** | ||
* re-add markers to input. Mutates map. | ||
* @param input normalized string to re-add markers | ||
* @param map list of glued markers. will be mutated | ||
*/ | ||
function addBackMarkers(input : string, map: MarkerEntry[]) : stringWithMarkers { | ||
// quick check | ||
if(map.empty()) return input; | ||
|
||
// output string | ||
let out = ''; | ||
|
||
// first, add all of the EOT entries | ||
while (!map.empty() && map[map.length - 1].eot) { | ||
const m = map.popEnd(); // remove the last entry | ||
out = m.marker + out; // prepend the marker | ||
} | ||
|
||
// now, process input codepoints from end to beginning | ||
for (i=input.end;i>=input.begin;i--) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this code's been spun off, here's a link to my old comment re: this section - #3445 (comment) |
||
// 0. prepend this codepoint | ||
out = i + out; | ||
|
||
// 1. any markers at the end of list which match get added | ||
while (!map.empty() && map[map.length - 1].glue == i) { | ||
// remove last entry, so we don't have to process it any more. | ||
const m = map.popEnd(); | ||
if (m.processed == false) { | ||
out = m.marker + out; // prepend marker | ||
} // else we already processed it | ||
} | ||
|
||
// 2. look for any out of order markers, not right at the end | ||
for (let m2 = map.length - 2; m2 >= 0; m2 --) { | ||
const m = map[m2]; // peek at end of list | ||
if (m.glue == i && !m.processed) { | ||
m.processed = true; // so we don't process it again | ||
out = m.marker + out; // prepend marker | ||
} | ||
} | ||
} | ||
return out; | ||
} | ||
|
||
// Normalization functions (to be provided) | ||
function isNFD(s): boolean; // return true if in NFD | ||
function getCanonicalCombiningClass(c): int; // return the CCC according to UAX#44 | ||
function normalizeNFD(s): string; // return NFD of a string | ||
|
||
// other functions (not shown) | ||
function removeMarkers(s): string; // strip out all markers from a string | ||
|
||
/** indicates whether a character does NOT interact with prior chars in | ||
* normalization. See hasBoundaryBefore() in ICU. */ | ||
function hasNFDBoundaryBefore(s: string): boolean { | ||
// See UAX #15 | ||
return (isNFD(s) && getCanonicalCombiningClass(s) == 0); | ||
} | ||
``` | ||
|
||
* * * | ||
|
||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reminder of a point from the original PR: #3445 (comment)