From 2472a941cfc3558d830a77413696d98c32bf68d8 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 4 Oct 2023 18:34:20 -0500 Subject: [PATCH 1/3] =?UTF-8?q?feat(core):=20match=20any=20marker=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add a regex form that matches any marker For: #9119 --- common/web/types/src/kmx/kmx-plus.ts | 4 ++-- .../types/src/ldml-keyboard/pattern-parser.ts | 12 ++++++++--- .../test/ldml-keyboard/test-pattern-parser.ts | 12 +++++++++++ core/src/ldml/C9134_ldml_markers.md | 2 ++ .../unit/ldml/keyboards/k_210_marker-test.xml | 20 ++++++++++++------- .../unit/ldml/keyboards/k_210_marker.xml | 3 +++ developer/src/kmc-ldml/src/compiler/tran.ts | 4 ++-- 7 files changed, 43 insertions(+), 14 deletions(-) diff --git a/common/web/types/src/kmx/kmx-plus.ts b/common/web/types/src/kmx/kmx-plus.ts index 1b653f2f5a0..ea36f7908ac 100644 --- a/common/web/types/src/kmx/kmx-plus.ts +++ b/common/web/types/src/kmx/kmx-plus.ts @@ -293,8 +293,8 @@ export class Vars extends Section { return v[0]; } } - substituteMarkerString(s : string) : string { - return MarkerParser.toSentinelString(s, this.markers); + substituteMarkerString(s : string, forMatch? : boolean) : string { + return MarkerParser.toSentinelString(s, this.markers, forMatch); } }; diff --git a/common/web/types/src/ldml-keyboard/pattern-parser.ts b/common/web/types/src/ldml-keyboard/pattern-parser.ts index f6013248d26..b02440553e6 100644 --- a/common/web/types/src/ldml-keyboard/pattern-parser.ts +++ b/common/web/types/src/ldml-keyboard/pattern-parser.ts @@ -64,6 +64,9 @@ export class MarkerParser { public static readonly MAX_MARKER_INDEX = constants.marker_max_index; /** Max count of markers */ public static readonly MAX_MARKER_COUNT = constants.marker_max_count; + /** Expression that matches any marker */ + public static readonly ANY_MARKER_MATCH = + this.SENTINEL + this.MARKER_CODE + `[\\u0001-\\u${this.MAX_MARKER_INDEX.toString(16)}]`; /** * Pattern for matching a marker reference, OR the special marker \m{.} @@ -91,10 +94,13 @@ export class MarkerParser { } /** @returns all marker strings as sentinel values */ - public static toSentinelString(s: string, markers?: OrderedStringList) : string { + public static toSentinelString(s: string, markers?: OrderedStringList, forMatch?: boolean) : string { if (!s) return s; return s.replaceAll(this.REFERENCE, (sub, arg) => { if (arg === MarkerParser.ANY_MARKER_ID) { + if (forMatch) { + return this.ANY_MARKER_MATCH; + } return MarkerParser.markerOutput(MarkerParser.ANY_MARKER_INDEX); } if (!markers) { @@ -103,10 +109,10 @@ export class MarkerParser { const order = markers.getItemOrder(arg); if (order === -1) { throw RangeError(`Internal Error: Could not find marker \\m{${arg}}`); - } else if(order >= MarkerParser.MAX_MARKER_INDEX) { + } else if(order > MarkerParser.MAX_MARKER_INDEX) { throw RangeError(`Internal Error: marker \\m{${arg}} has out of range index ${order}`); } else { - return MarkerParser.markerOutput(order+1); + return MarkerParser.markerOutput(order + 1); } }); } diff --git a/common/web/types/test/ldml-keyboard/test-pattern-parser.ts b/common/web/types/test/ldml-keyboard/test-pattern-parser.ts index e7f06a1c557..1ea96797a7b 100644 --- a/common/web/types/test/ldml-keyboard/test-pattern-parser.ts +++ b/common/web/types/test/ldml-keyboard/test-pattern-parser.ts @@ -75,6 +75,7 @@ describe('Test of Pattern Parsers', () => { 'a': 0, 'b': 1, 'c': 2, + 'zz': MarkerParser.MAX_MARKER_INDEX - 1, // this is an ordering, so needs to be -1 'zzz': 0x2FFFFF, }; const o = m[item]; @@ -103,6 +104,17 @@ describe('Test of Pattern Parsers', () => { markers ) ); + // verify the matching behavior of these + assert.isTrue(new RegExp(MarkerParser.toSentinelString(`Q\\m{a}`, markers, true), 'u') + .test(MarkerParser.toSentinelString(`Q\\m{a}`, markers, false)), `Q\\m{a} did not match`); + assert.isFalse(new RegExp(MarkerParser.toSentinelString(`Q\\m{a}`, markers, true), 'u') + .test(MarkerParser.toSentinelString(`Q\\m{b}`, markers, false)), `Q\\m{a} should not match Q\\m{b}`); + assert.isTrue(new RegExp(MarkerParser.toSentinelString(`Q\\m{.}`, markers, true), 'u') + .test(MarkerParser.toSentinelString(`Q\\m{a}`, markers, false)), `Q\\m{.} did not match Q\\m{a}`); + assert.isTrue(new RegExp(MarkerParser.toSentinelString(`Q\\m{.}`, markers, true), 'u') + .test(MarkerParser.toSentinelString(`Q\\m{zz}`, markers, false)), `Q\\m{.} did not match Q\\m{zz} (max marker)`); + assert.isFalse(new RegExp(MarkerParser.toSentinelString(`Q\\m{.}`, markers, true), 'u') + .test(MarkerParser.toSentinelString(`\\m{a}`, markers, false)), `Q\\m{.} did not match \\m{a}`); }); it('should match some marker constants', () => { assert.equal(constants.uc_sentinel, KMXFile.UC_SENTINEL); diff --git a/core/src/ldml/C9134_ldml_markers.md b/core/src/ldml/C9134_ldml_markers.md index 065dd0ee2f3..788cd82b500 100644 --- a/core/src/ldml/C9134_ldml_markers.md +++ b/core/src/ldml/C9134_ldml_markers.md @@ -47,6 +47,8 @@ Note that this is different from other 0-based indices in KMX+. If there are thr ## Compiler (kmc) - `U+FFFF` needs to be illegal as a literal or escaped sequence. So `\u{FFFF}` is not allowed, for example, nor as a literal in the UTF-8 .xml stream. +- Matching `\m{abc}` (some marker) will turn into a match for `U+FFFF U+0008 U+XXXX` for that match. +- Matching `\m{.}` (_any_ marker) will turn into the special sequence `U+FFFF U+0008 [U+0001-U+D7FE]` where the latter is a range ### `vars` diff --git a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml index 3286ac11792..73083e180f5 100644 --- a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml +++ b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml @@ -3,7 +3,7 @@ - + - - @@ -41,6 +35,18 @@ + --> + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_210_marker.xml b/core/tests/unit/ldml/keyboards/k_210_marker.xml index 1635ce0a2a6..d411725f6d0 100644 --- a/core/tests/unit/ldml/keyboards/k_210_marker.xml +++ b/core/tests/unit/ldml/keyboards/k_210_marker.xml @@ -33,6 +33,9 @@ + + + diff --git a/developer/src/kmc-ldml/src/compiler/tran.ts b/developer/src/kmc-ldml/src/compiler/tran.ts index 9f39e302ef3..f874273f6f6 100644 --- a/developer/src/kmc-ldml/src/compiler/tran.ts +++ b/developer/src/kmc-ldml/src/compiler/tran.ts @@ -142,8 +142,8 @@ export class TransformCompiler Date: Wed, 4 Oct 2023 18:45:45 -0500 Subject: [PATCH 2/3] =?UTF-8?q?feat(core):=20more=20marker=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - reenable some tests For: #9119 --- .../unit/ldml/keyboards/k_210_marker-test.xml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml index 73083e180f5..24f104f4d71 100644 --- a/core/tests/unit/ldml/keyboards/k_210_marker-test.xml +++ b/core/tests/unit/ldml/keyboards/k_210_marker-test.xml @@ -3,7 +3,7 @@ - + - + + + + + + + + + + From 25c20ee812a619391574a9633c8e21e243a65b9d Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 6 Oct 2023 11:58:59 -0500 Subject: [PATCH 3/3] =?UTF-8?q?feat(common):=20fix=20marker=20parsing=20?= =?UTF-8?q?=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cleaned up regex For: #9119 --- .../web/types/src/ldml-keyboard/pattern-parser.ts | 10 ++++++++-- .../test/ldml-keyboard/test-pattern-parser.ts | 14 +++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/common/web/types/src/ldml-keyboard/pattern-parser.ts b/common/web/types/src/ldml-keyboard/pattern-parser.ts index b02440553e6..b0d6fb07962 100644 --- a/common/web/types/src/ldml-keyboard/pattern-parser.ts +++ b/common/web/types/src/ldml-keyboard/pattern-parser.ts @@ -64,9 +64,15 @@ export class MarkerParser { public static readonly MAX_MARKER_INDEX = constants.marker_max_index; /** Max count of markers */ public static readonly MAX_MARKER_COUNT = constants.marker_max_count; + + private static anyMarkerMatch() : string { + const start = (`0000` + (this.MIN_MARKER_INDEX).toString(16)).slice(-4); + const end = (`0000` + (this.MAX_MARKER_INDEX).toString(16)).slice(-4); + return `${this.SENTINEL}${this.MARKER_CODE}[\\u${start}-\\u${end}]`; + } + /** Expression that matches any marker */ - public static readonly ANY_MARKER_MATCH = - this.SENTINEL + this.MARKER_CODE + `[\\u0001-\\u${this.MAX_MARKER_INDEX.toString(16)}]`; + public static readonly ANY_MARKER_MATCH = MarkerParser.anyMarkerMatch(); /** * Pattern for matching a marker reference, OR the special marker \m{.} diff --git a/common/web/types/test/ldml-keyboard/test-pattern-parser.ts b/common/web/types/test/ldml-keyboard/test-pattern-parser.ts index 1ea96797a7b..0274f2a5c27 100644 --- a/common/web/types/test/ldml-keyboard/test-pattern-parser.ts +++ b/common/web/types/test/ldml-keyboard/test-pattern-parser.ts @@ -105,16 +105,20 @@ describe('Test of Pattern Parsers', () => { ) ); // verify the matching behavior of these - assert.isTrue(new RegExp(MarkerParser.toSentinelString(`Q\\m{a}`, markers, true), 'u') + assert.isTrue(new RegExp(MarkerParser.toSentinelString(`^Q\\m{a}$`, markers, true), 'u') .test(MarkerParser.toSentinelString(`Q\\m{a}`, markers, false)), `Q\\m{a} did not match`); - assert.isFalse(new RegExp(MarkerParser.toSentinelString(`Q\\m{a}`, markers, true), 'u') + assert.isFalse(new RegExp(MarkerParser.toSentinelString(`^Q\\m{a}$`, markers, true), 'u') .test(MarkerParser.toSentinelString(`Q\\m{b}`, markers, false)), `Q\\m{a} should not match Q\\m{b}`); - assert.isTrue(new RegExp(MarkerParser.toSentinelString(`Q\\m{.}`, markers, true), 'u') + assert.isTrue(new RegExp(MarkerParser.toSentinelString(`^Q\\m{.}$`, markers, true), 'u') .test(MarkerParser.toSentinelString(`Q\\m{a}`, markers, false)), `Q\\m{.} did not match Q\\m{a}`); - assert.isTrue(new RegExp(MarkerParser.toSentinelString(`Q\\m{.}`, markers, true), 'u') + assert.isTrue(new RegExp(MarkerParser.toSentinelString(`^Q\\m{.}$`, markers, true), 'u') .test(MarkerParser.toSentinelString(`Q\\m{zz}`, markers, false)), `Q\\m{.} did not match Q\\m{zz} (max marker)`); - assert.isFalse(new RegExp(MarkerParser.toSentinelString(`Q\\m{.}`, markers, true), 'u') + assert.isFalse(new RegExp(MarkerParser.toSentinelString(`^Q\\m{.}$`, markers, true), 'u') .test(MarkerParser.toSentinelString(`\\m{a}`, markers, false)), `Q\\m{.} did not match \\m{a}`); + assert.isTrue(new RegExp(MarkerParser.toSentinelString(`^\\m{.}$`, markers, true), 'u') + .test(MarkerParser.toSentinelString(`\\m{a}`, markers, false)), `\\m{.} did not match \\m{a}`); + assert.isFalse(new RegExp(MarkerParser.toSentinelString(`^\\m{.}$`, markers, true), 'u') + .test(MarkerParser.toSentinelString(`\\m{a}\\m{b}`, markers, false)), `\\m{.} did not match \\m{a}\\m{b}`); }); it('should match some marker constants', () => { assert.equal(constants.uc_sentinel, KMXFile.UC_SENTINEL);