-
Notifications
You must be signed in to change notification settings - Fork 152
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Closes #4357

Convert K Unicode-based regex to an equivalent Flex byte-based regex:
- Outside of a character class, just parenthesize to keep the bytes grouped
  - `r"😊*"` becomes `r"(\xF0\x9F\x98\x8A)*"`
- Inside a non-negated character class, factor any single Unicode character out into an explicit `|`
  - `r"[a😊b]"` becomes `r"(\xF0\x9F\x98\x8A)|[ab]"`
- In all other cases (character ranges and negated character classes), report an error if there are non-ASCII characters

Additionally,
- Check that character ranges `[c1-c2]` have `codepoint(c1) <= codepoint(c2)`
- Check that numeric ranges `r{n,m}` have `n <= m`

The commit history is incremental, and I'd recommend reviewing commit-by-commit.

---------

Co-authored-by: Bruce Collie <[email protected]>
- Loading branch information
1 parent
e974730
commit 94686d5
Showing
16 changed files
with
540 additions
and
160 deletions.
There are no files selected for viewing
8 changes: 8 additions & 0 deletions
8
k-distribution/tests/regression-new/checks/checkRegexRanges.k
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
// Copyright (c) Runtime Verification, Inc. All Rights Reserved. | ||
module CHECKREGEXRANGES | ||
syntax lexical Foo = r"a|[1-#]b" | ||
syntax Bar ::= r"a|[1-#]b" | ||
|
||
syntax lexical Baz = r"(a|b|c){100,1}" | ||
syntax Buz ::= r"(a|b|c){100,1}" | ||
endmodule |
21 changes: 21 additions & 0 deletions
21
k-distribution/tests/regression-new/checks/checkRegexRanges.k.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
[Error] Outer Parser: Invalid character range '1-#'. Start of range U+0031 is greater than end of range U+0023. | ||
Source(checkRegexRanges.k) | ||
Location(3,1,3,33) | ||
3 | syntax lexical Foo = r"a|[1-#]b" | ||
. ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
[Error] Outer Parser: Invalid character range '1-#'. Start of range U+0031 is greater than end of range U+0023. | ||
Source(checkRegexRanges.k) | ||
Location(4,16,4,27) | ||
4 | syntax Bar ::= r"a|[1-#]b" | ||
. ^~~~~~~~~~~ | ||
[Error] Outer Parser: Invalid numeric range '(a|(b|c)){100,1}'. Start of range 100 is greater than end of range 1. | ||
Source(checkRegexRanges.k) | ||
Location(6,1,6,39) | ||
6 | syntax lexical Baz = r"(a|b|c){100,1}" | ||
. ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
[Error] Outer Parser: Invalid numeric range '(a|(b|c)){100,1}'. Start of range 100 is greater than end of range 1. | ||
Source(checkRegexRanges.k) | ||
Location(7,16,7,33) | ||
7 | syntax Buz ::= r"(a|b|c){100,1}" | ||
. ^~~~~~~~~~~~~~~~~ | ||
[Error] Compiler: Had 4 parsing errors. |
5 changes: 5 additions & 0 deletions
5
k-distribution/tests/regression-new/checks/checkRegexUnicode.k
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
// Copyright (c) Runtime Verification, Inc. All Rights Reserved. | ||
module CHECKREGEXUNICODE | ||
syntax lexical Foo = r"[^aπ][π¦-ab-π]" | ||
syntax Bar ::= r"[^aπ][π¦-ab-π]" [token] | ||
endmodule |
31 changes: 31 additions & 0 deletions
31
k-distribution/tests/regression-new/checks/checkRegexUnicode.k.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
[Error] Outer Parser: Invalid character range '😦-a'. Start of range U+1F626 is greater than end of range U+0061. | |
Source(checkRegexUnicode.k) | ||
Location(3,1,3,41) | ||
3 | syntax lexical Foo = r"[^aπ][π¦-ab-π]" | ||
. ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
[Error] Outer Parser: Unsupported non-ASCII characters found in character class range: [π¦, π] | ||
Source(checkRegexUnicode.k) | ||
Location(3,1,3,41) | ||
3 | syntax lexical Foo = r"[^aπ][π¦-ab-π]" | ||
. ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
[Error] Outer Parser: Unsupported non-ASCII characters found in negated character class: [π] | ||
Source(checkRegexUnicode.k) | ||
Location(3,1,3,41) | ||
3 | syntax lexical Foo = r"[^aπ][π¦-ab-π]" | ||
. ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
[Error] Outer Parser: Invalid character range '😦-a'. Start of range U+1F626 is greater than end of range U+0061. | |
Source(checkRegexUnicode.k) | ||
Location(4,16,4,43) | ||
4 | syntax Bar ::= r"[^aπ][π¦-ab-π]" [token] | ||
. ^~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
[Error] Outer Parser: Unsupported non-ASCII characters found in character class range: [π¦, π] | ||
Source(checkRegexUnicode.k) | ||
Location(4,16,4,43) | ||
4 | syntax Bar ::= r"[^aπ][π¦-ab-π]" [token] | ||
. ^~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
[Error] Outer Parser: Unsupported non-ASCII characters found in negated character class: [π] | ||
Source(checkRegexUnicode.k) | ||
Location(4,16,4,43) | ||
4 | syntax Bar ::= r"[^aπ][π¦-ab-π]" [token] | ||
. ^~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
[Error] Compiler: Had 6 parsing errors. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
ππ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
<k> | ||
ππ¦π¦ ~> .K | ||
</k> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
DEF=test | ||
EXT=test | ||
TESTDIR=. | ||
KOMPILE_BACKEND=llvm | ||
|
||
include ../../../include/kframework/ktest.mak |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
// Copyright (c) Runtime Verification, Inc. All Rights Reserved. | ||
module TEST-SYNTAX | ||
syntax lexical Emote = r"π?[π¦π]+" | ||
|
||
syntax Emoji ::= r"{Emote}" [token] | ||
|
||
endmodule | ||
|
||
module TEST | ||
imports TEST-SYNTAX | ||
configuration <k> $PGM:Emoji </k> | ||
rule <k>ππ => ππ¦π¦</k> | ||
endmodule |
147 changes: 0 additions & 147 deletions
147
k-frontend/src/main/java/org/kframework/compile/checks/CheckLexicalIdentifiers.java
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.