Commit adc0fa4

documentation: use generic lexer infrastructure in Toy parser

1 parent 2ef383b · commit adc0fa4

Showing 4 changed files with 251 additions and 198 deletions.
First changed file (the Toy lexer):

```diff
@@ -1,126 +1,196 @@
 import re
-from dataclasses import dataclass
+from enum import Enum, auto
 from pathlib import Path
-
-from .location import Location
-
-
-@dataclass(init=False)
-class Token:
-    file: Path
-    line: int
-    col: int
-    text: str
-
-    def __init__(self, file: Path, line: int, col: int, text: str):
-        self.file = file
-        self.line = line
-        self.col = col
-        self.text = text
-
-    @property
-    def loc(self):
-        return Location(self.file, self.line, self.col)
-
-    @classmethod
-    def name(cls):
-        return cls.__name__
-
-
-@dataclass
-class IdentifierToken(Token):
-    pass
-
-
-@dataclass
-class NumberToken(Token):
-    value: float
-
-
-@dataclass
-class OperatorToken(Token):
-    pass
-
-
-@dataclass
-class SpecialToken(Token):
-    pass
-
-
-@dataclass
-class EOFToken(Token):
-    pass
+from string import hexdigits
+from typing import TypeAlias, cast
+
+from xdsl.utils.exceptions import ParseError
+from xdsl.utils.lexer import Input, Lexer, Position, Span, Token


+class ToyTokenKind(Enum):
+    SEMICOLON = auto()
+    PARENTHESE_OPEN = auto()
+    PARENTHESE_CLOSE = auto()
+    BRACKET_OPEN = auto()
+    BRACKET_CLOSE = auto()
+    SBRACKET_OPEN = auto()
+    SBRACKET_CLOSE = auto()
+    LT = auto()
+    GT = auto()
+    EQ = auto()
+    COMMA = auto()
+
+    EOF = auto()
+
+    RETURN = auto()
+    VAR = auto()
+    DEF = auto()
+
+    IDENTIFIER = auto()
+    NUMBER = auto()
+    OPERATOR = auto()
+
+
+SINGLE_CHAR_TOKENS = {
+    ";": ToyTokenKind.SEMICOLON,
+    "(": ToyTokenKind.PARENTHESE_OPEN,
+    ")": ToyTokenKind.PARENTHESE_CLOSE,
+    "{": ToyTokenKind.BRACKET_OPEN,
+    "}": ToyTokenKind.BRACKET_CLOSE,
+    "[": ToyTokenKind.SBRACKET_OPEN,
+    "]": ToyTokenKind.SBRACKET_CLOSE,
+    "<": ToyTokenKind.LT,
+    ">": ToyTokenKind.GT,
+    "=": ToyTokenKind.EQ,
+    ",": ToyTokenKind.COMMA,
+    "+": ToyTokenKind.OPERATOR,
+    "-": ToyTokenKind.OPERATOR,
+    "*": ToyTokenKind.OPERATOR,
+    "/": ToyTokenKind.OPERATOR,
+}

-IDENTIFIER_CHARS = re.compile(r"[\w]|[\d]|_")
-OPERATOR_CHARS = set("+-*/")
-SPECIAL_CHARS = set("<>}{(),;=[]")
+SPECIAL_CHARS = set(",")
+
+
+ToyToken: TypeAlias = Token[ToyTokenKind]
+
+
+class ToyLexer(Lexer[ToyTokenKind]):
+    def _is_in_bounds(self, size: Position = 1) -> bool:
+        """
+        Check if the current position is within the bounds of the input.
+        """
+        return self.pos + size - 1 < self.input.len
+
+    def _get_chars(self, size: int = 1) -> str | None:
+        """
+        Get the character at the current location, or multiple characters ahead.
+        Return None if the position is out of bounds.
+        """
+        res = self.input.slice(self.pos, self.pos + size)
+        self.pos += size
+        return res
+
+    def _peek_chars(self, size: int = 1) -> str | None:
+        """
+        Peek at the character at the current location, or multiple characters ahead.
+        Return None if the position is out of bounds.
+        """
+        return self.input.slice(self.pos, self.pos + size)
+
+    def _consume_chars(self, size: int = 1) -> None:
+        """
+        Advance the lexer position in the input by the given amount.
+        """
+        self.pos += size
+
+    def _consume_regex(self, regex: re.Pattern[str]) -> re.Match[str] | None:
+        """
+        Advance the lexer position to the end of the next match of the given
+        regular expression.
+        """
+        match = regex.match(self.input.content, self.pos)
+        if match is None:
+            return None
+        self.pos = match.end()
+        return match
+
+    _whitespace_regex = re.compile(r"((#[^\n]*(\n)?)|(\s+))*", re.ASCII)
+
+    def _consume_whitespace(self) -> None:
+        """
+        Consume whitespace and comments.
+        """
+        self._consume_regex(self._whitespace_regex)
+
+    def lex(self) -> ToyToken:
+        # First, skip whitespaces
+        self._consume_whitespace()
+
+        start_pos = self.pos
+        current_char = self._get_chars()
+
+        # Handle end of file
+        if current_char is None:
+            return self._form_token(ToyTokenKind.EOF, start_pos)
+
+        # bare identifier
+        if current_char.isalpha() or current_char == "_":
+            return self._lex_bare_identifier(start_pos)
+
+        # single-char punctuation that are not part of a multi-char token
+        single_char_token_kind = SINGLE_CHAR_TOKENS.get(current_char)
+        if single_char_token_kind is not None:
+            return self._form_token(single_char_token_kind, start_pos)
+
+        if current_char.isnumeric():
+            return self._lex_number(start_pos)
+
+        raise ParseError(
+            Span(start_pos, start_pos + 1, self.input),
+            f"Unexpected character: {current_char}",
+        )
+
+    IDENTIFIER_SUFFIX = r"[a-zA-Z0-9_$.]*"
+    bare_identifier_suffix_regex = re.compile(IDENTIFIER_SUFFIX)
+
+    def _lex_bare_identifier(self, start_pos: Position) -> ToyToken:
+        """
+        Lex a bare identifier with the following grammar:
+        `bare-id ::= (letter|[_]) (letter|digit|[_$.])*`
+        The first character is expected to have already been parsed.
+        """
+        self._consume_regex(self.bare_identifier_suffix_regex)
+
+        return self._form_token(ToyTokenKind.IDENTIFIER, start_pos)
+
+    _hexdigits_star_regex = re.compile(r"[0-9a-fA-F]*")
+    _digits_star_regex = re.compile(r"[0-9]*")
+    _fractional_suffix_regex = re.compile(r"\.[0-9]*([eE][+-]?[0-9]+)?")
+
+    def _lex_number(self, start_pos: Position) -> ToyToken:
+        """
+        Lex a number literal, which is either a decimal or an hexadecimal.
+        The first character is expected to have already been parsed.
+        """
+        first_digit = self.input.at(self.pos - 1)
+
+        # Hexadecimal case, we only parse it if we see the first '0x' characters,
+        # and then a first digit.
+        # Otherwise, a string like '0xi32' would not be parsed correctly.
+        if (
+            first_digit == "0"
+            and self._peek_chars() == "x"
+            and self._is_in_bounds(2)
+            and cast(str, self.input.at(self.pos + 1)) in hexdigits
+        ):
+            self._consume_chars(2)
+            self._consume_regex(self._hexdigits_star_regex)
+            return self._form_token(ToyTokenKind.NUMBER, start_pos)
+
+        # Decimal case
+        self._consume_regex(self._digits_star_regex)
+
+        # Check if we are lexing a floating point
+        match = self._consume_regex(self._fractional_suffix_regex)
+        if match is not None:
+            return self._form_token(ToyTokenKind.NUMBER, start_pos)
+        return self._form_token(ToyTokenKind.NUMBER, start_pos)


 def tokenize(file: Path, program: str | None = None):
-    tokens: list[Token] = []
-
     if program is None:
         with open(file) as f:
             program = f.read()

-    text = ""
-    row = col = 1
-
-    def flush():
-        nonlocal col, row, text
-        n = len(text)
-        if n == 0:
-            return
-
-        true_col = col - n
-
-        if text[0].isnumeric():
-            value = float(text)
-            tokens.append(NumberToken(file, row, true_col, text, value))
-        else:
-            tokens.append(IdentifierToken(file, row, true_col, text))
-
-        text = ""
-
-    for row, line in enumerate(program.splitlines()):
-        # 1-indexed
-        row += 1
-        for col, char in enumerate(line):
-            # 1-indexed
-            col += 1
-            if char == "#":
-                # Comment
-                break
-
-            if IDENTIFIER_CHARS.match(char):
-                text += char
-                continue
-
-            if char == ".":
-                # parse floating point
-                if not text or (text[0].isnumeric() and "." not in text):
-                    # allow `.5` and `5.5` but not `5.5.5`
-                    text += char
-                    continue
-
-            flush()
-
-            if char == " ":
-                continue
-
-            if char in OPERATOR_CHARS:
-                tokens.append(OperatorToken(file, row, col, char))
-                continue
-            elif char in SPECIAL_CHARS:
-                tokens.append(SpecialToken(file, row, col, char))
-                continue
-
-            raise AssertionError(f"unhandled char {char} at ({row}, {col}) in \n{line}")
-
-        col += 1
-        flush()
-
-    tokens.append(EOFToken(file, row, col, ""))
+    toy_lexer = ToyLexer(Input(program, str(file)))
+
+    tokens = [toy_lexer.lex()]
+
+    while tokens[-1].kind != ToyTokenKind.EOF:
+        tokens.append(toy_lexer.lex())

     return tokens
```
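For a sense of how the rewritten `tokenize` behaves, here is a minimal usage sketch (not part of the commit; the source string and file name are hypothetical, and it assumes `tokenize` is importable from the lexer module above). It relies on the generic `Token` exposing its kind as `token.kind` and its matched text via `token.span.text`, as the diff suggests. Note that at this stage keywords such as `def` and `var` still lex as `IDENTIFIER`: the `RETURN`/`VAR`/`DEF` kinds exist in the enum, but `_lex_bare_identifier` never produces them.

```python
from pathlib import Path

# Passing `program` explicitly means tokenize() never opens the file,
# so the path only serves as the name recorded in the Input.
source = "def main() { var a = [1, 2]; }"
tokens = tokenize(Path("example.toy"), source)

for token in tokens:
    # `token.kind` is a ToyTokenKind; `token.span.text` is the matched text.
    print(token.kind.name, repr(token.span.text))

# Expected kinds, in order:
# IDENTIFIER IDENTIFIER PARENTHESE_OPEN PARENTHESE_CLOSE BRACKET_OPEN
# IDENTIFIER IDENTIFIER EQ SBRACKET_OPEN NUMBER COMMA NUMBER SBRACKET_CLOSE
# SEMICOLON BRACKET_CLOSE EOF
```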
Second changed file (the Toy `location` module):

```diff
@@ -1,14 +1,40 @@
+import re
 from dataclasses import dataclass
-from pathlib import Path
+
+from typing_extensions import Any
+
+from xdsl.utils.lexer import Token


 @dataclass
 class Location:
     "Structure definition a location in a file."

-    file: Path
+    file: str
     line: int
     col: int

     def __repr__(self):
         return f"{self.file}:{self.line}:{self.col}"
+
+
+_NEWLINE = re.compile(r"\n")
+
+
+def loc(token: Token[Any]) -> Location:
+    file = token.span.input.name
+    # Could be much faster
+
+    remaining = token.span.start
+    prev_end = 0
+
+    for line, newline_match in enumerate(
+        re.finditer(_NEWLINE, token.span.input.content)
+    ):
+        len_line = newline_match.start() - prev_end
+        if remaining < len_line:
+            return Location(file, line + 1, remaining + 1)
+        remaining -= len_line + 1
+        prev_end = newline_match.end()
+
+    raise AssertionError(f"Could not find location of token {token}")
```
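The new `loc` helper converts a token's absolute span offset back into 1-based line/column coordinates by walking the newline matches: each iteration subtracts one line's length plus its newline from the remaining offset, until the offset falls inside the current line. As written, it only returns when the token sits before the input's last newline; otherwise it reaches the `AssertionError`. A standalone sketch of the same arithmetic, with a hypothetical helper name and inputs (not part of the commit), makes the walk easy to check:

```python
import re

_NEWLINE = re.compile(r"\n")


def offset_to_line_col(content: str, offset: int) -> tuple[int, int]:
    # Same walk as `loc`: consume one line (plus its newline) at a time
    # until the remaining offset lands inside the current line.
    remaining = offset
    prev_end = 0
    for line, newline_match in enumerate(re.finditer(_NEWLINE, content)):
        len_line = newline_match.start() - prev_end
        if remaining < len_line:
            # 0-based line index and offset converted to 1-based coordinates.
            return (line + 1, remaining + 1)
        remaining -= len_line + 1
        prev_end = newline_match.end()
    raise AssertionError(f"offset {offset} is past the last newline")


# Offset 7 is the 'b' that starts the second line: line 2, column 1.
assert offset_to_line_col("var a;\nb = 2;\n", 7) == (2, 1)
```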