Commit adc0fa4

documentation: use generic lexer infrastructure in Toy parser

superlopuh committed Dec 8, 2024
1 parent 2ef383b commit adc0fa4
Showing 4 changed files with 251 additions and 198 deletions.
292 changes: 181 additions & 111 deletions docs/Toy/toy/frontend/lexer.py
@@ -1,126 +1,196 @@
import re
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path

from .location import Location


@dataclass(init=False)
class Token:
    file: Path
    line: int
    col: int
    text: str

    def __init__(self, file: Path, line: int, col: int, text: str):
        self.file = file
        self.line = line
        self.col = col
        self.text = text

    @property
    def loc(self):
        return Location(self.file, self.line, self.col)

    @classmethod
    def name(cls):
        return cls.__name__


@dataclass
class IdentifierToken(Token):
    pass


@dataclass
class NumberToken(Token):
    value: float


@dataclass
class OperatorToken(Token):
    pass


@dataclass
class SpecialToken(Token):
    pass


@dataclass
class EOFToken(Token):
    pass

from string import hexdigits
from typing import TypeAlias, cast

from xdsl.utils.exceptions import ParseError
from xdsl.utils.lexer import Input, Lexer, Position, Span, Token


class ToyTokenKind(Enum):
    SEMICOLON = auto()
    PARENTHESE_OPEN = auto()
    PARENTHESE_CLOSE = auto()
    BRACKET_OPEN = auto()
    BRACKET_CLOSE = auto()
    SBRACKET_OPEN = auto()
    SBRACKET_CLOSE = auto()
    LT = auto()
    GT = auto()
    EQ = auto()
    COMMA = auto()

    EOF = auto()

    RETURN = auto()
    VAR = auto()
    DEF = auto()

    IDENTIFIER = auto()
    NUMBER = auto()
    OPERATOR = auto()


SINGLE_CHAR_TOKENS = {
    ";": ToyTokenKind.SEMICOLON,
    "(": ToyTokenKind.PARENTHESE_OPEN,
    ")": ToyTokenKind.PARENTHESE_CLOSE,
    "{": ToyTokenKind.BRACKET_OPEN,
    "}": ToyTokenKind.BRACKET_CLOSE,
    "[": ToyTokenKind.SBRACKET_OPEN,
    "]": ToyTokenKind.SBRACKET_CLOSE,
    "<": ToyTokenKind.LT,
    ">": ToyTokenKind.GT,
    "=": ToyTokenKind.EQ,
    ",": ToyTokenKind.COMMA,
    "+": ToyTokenKind.OPERATOR,
    "-": ToyTokenKind.OPERATOR,
    "*": ToyTokenKind.OPERATOR,
    "/": ToyTokenKind.OPERATOR,
}

IDENTIFIER_CHARS = re.compile(r"[\w]|[\d]|_")
OPERATOR_CHARS = set("+-*/")
SPECIAL_CHARS = set("<>}{(),;=[]")
SPECIAL_CHARS = set(",")


ToyToken: TypeAlias = Token[ToyTokenKind]


class ToyLexer(Lexer[ToyTokenKind]):
    def _is_in_bounds(self, size: Position = 1) -> bool:
        """
        Check if the current position is within the bounds of the input.
        """
        return self.pos + size - 1 < self.input.len

    def _get_chars(self, size: int = 1) -> str | None:
        """
        Get the character at the current location, or multiple characters ahead.
        Return None if the position is out of bounds.
        """
        res = self.input.slice(self.pos, self.pos + size)
        self.pos += size
        return res

    def _peek_chars(self, size: int = 1) -> str | None:
        """
        Peek at the character at the current location, or multiple characters ahead.
        Return None if the position is out of bounds.
        """
        return self.input.slice(self.pos, self.pos + size)

    def _consume_chars(self, size: int = 1) -> None:
        """
        Advance the lexer position in the input by the given amount.
        """
        self.pos += size

    def _consume_regex(self, regex: re.Pattern[str]) -> re.Match[str] | None:
        """
        Advance the lexer position to the end of the next match of the given
        regular expression.
        """
        match = regex.match(self.input.content, self.pos)
        if match is None:
            return None
        self.pos = match.end()
        return match

    _whitespace_regex = re.compile(r"((#[^\n]*(\n)?)|(\s+))*", re.ASCII)

    def _consume_whitespace(self) -> None:
        """
        Consume whitespace and comments.
        """
        self._consume_regex(self._whitespace_regex)
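
    # Illustration (not part of the original file): given the input
    # "  # note\n42", _consume_whitespace advances past both the spaces and
    # the comment, leaving the lexer positioned at "42".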

    def lex(self) -> ToyToken:
        # First, skip whitespace and comments
        self._consume_whitespace()

        start_pos = self.pos
        current_char = self._get_chars()

        # Handle end of file
        if current_char is None:
            return self._form_token(ToyTokenKind.EOF, start_pos)

        # Bare identifier
        if current_char.isalpha() or current_char == "_":
            return self._lex_bare_identifier(start_pos)

        # Single-character punctuation and operators
        single_char_token_kind = SINGLE_CHAR_TOKENS.get(current_char)
        if single_char_token_kind is not None:
            return self._form_token(single_char_token_kind, start_pos)

        if current_char.isnumeric():
            return self._lex_number(start_pos)

        raise ParseError(
            Span(start_pos, start_pos + 1, self.input),
            f"Unexpected character: {current_char}",
        )

    IDENTIFIER_SUFFIX = r"[a-zA-Z0-9_$.]*"
    bare_identifier_suffix_regex = re.compile(IDENTIFIER_SUFFIX)

    def _lex_bare_identifier(self, start_pos: Position) -> ToyToken:
        """
        Lex a bare identifier with the following grammar:
        `bare-id ::= (letter|[_]) (letter|digit|[_$.])*`
        The first character is expected to have already been parsed.
        """
        self._consume_regex(self.bare_identifier_suffix_regex)

        return self._form_token(ToyTokenKind.IDENTIFIER, start_pos)
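
    # Note: bare words like "var" or "def" always lex as IDENTIFIER here; the
    # RETURN/VAR/DEF kinds above are presumably matched downstream (e.g. by
    # the parser comparing token text), since no keyword table appears in
    # this file.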

    _hexdigits_star_regex = re.compile(r"[0-9a-fA-F]*")
    _digits_star_regex = re.compile(r"[0-9]*")
    _fractional_suffix_regex = re.compile(r"\.[0-9]*([eE][+-]?[0-9]+)?")

    def _lex_number(self, start_pos: Position) -> ToyToken:
        """
        Lex a number literal, which is either a decimal or a hexadecimal.
        The first character is expected to have already been parsed.
        """
        first_digit = self.input.at(self.pos - 1)

        # Hexadecimal case: we only take this path if we see the '0x' prefix
        # followed by a hex digit. Otherwise, a string like '0xi32' would not
        # be parsed correctly.
        if (
            first_digit == "0"
            and self._peek_chars() == "x"
            and self._is_in_bounds(2)
            and cast(str, self.input.at(self.pos + 1)) in hexdigits
        ):
            self._consume_chars(2)
            self._consume_regex(self._hexdigits_star_regex)
            return self._form_token(ToyTokenKind.NUMBER, start_pos)

        # Decimal case: consume the integer digits, then an optional
        # fractional/exponent suffix.
        self._consume_regex(self._digits_star_regex)
        self._consume_regex(self._fractional_suffix_regex)
        return self._form_token(ToyTokenKind.NUMBER, start_pos)
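
    # Illustration (not part of the original file): with the rules above,
    # "123", "0x1F", "1.5", "1.5e-3", and "5." all lex as NUMBER, while
    # ".5" raises a ParseError, since lex() only dispatches to _lex_number
    # when a token starts with a digit.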


def tokenize(file: Path, program: str | None = None):
    tokens: list[Token] = []

    if program is None:
        with open(file) as f:
            program = f.read()

    text = ""
    row = col = 1

    def flush():
        nonlocal col, row, text
        n = len(text)
        if n == 0:
            return

        true_col = col - n

        if text[0].isnumeric():
            value = float(text)
            tokens.append(NumberToken(file, row, true_col, text, value))
        else:
            tokens.append(IdentifierToken(file, row, true_col, text))

        text = ""

    for row, line in enumerate(program.splitlines()):
        # 1-indexed
        row += 1
        for col, char in enumerate(line):
            # 1-indexed
            col += 1
            if char == "#":
                # Comment
                break

            if IDENTIFIER_CHARS.match(char):
                text += char
                continue

            if char == ".":
                # parse floating point
                if not text or (text[0].isnumeric() and "." not in text):
                    # allow `.5` and `5.5` but not `5.5.5`
                    text += char
                    continue

            flush()

            if char == " ":
                continue

            if char in OPERATOR_CHARS:
                tokens.append(OperatorToken(file, row, col, char))
                continue
            elif char in SPECIAL_CHARS:
                tokens.append(SpecialToken(file, row, col, char))
                continue

            raise AssertionError(f"unhandled char {char} at ({row}, {col}) in \n{line}")

    toy_lexer = ToyLexer(Input(program, str(file)))

    col += 1
    flush()

    tokens = [toy_lexer.lex()]

    tokens.append(EOFToken(file, row, col, ""))

    while tokens[-1].kind != ToyTokenKind.EOF:
        tokens.append(toy_lexer.lex())

    return tokens
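
A minimal sketch of driving the new tokenize entry point (the Toy snippet and
file name are illustrative, and the sketch assumes xdsl's Span exposes the
matched text via its text property):

    from pathlib import Path

    program = "def main() { var a = [1, 2]; print(a); }"
    tokens = tokenize(Path("example.toy"), program)

    # The stream always ends with exactly one EOF token.
    assert tokens[-1].kind == ToyTokenKind.EOF
    for token in tokens:
        print(token.kind.name, token.span.text)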
30 changes: 28 additions & 2 deletions docs/Toy/toy/frontend/location.py
@@ -1,14 +1,40 @@
import re
from dataclasses import dataclass
from pathlib import Path

from typing_extensions import Any

from xdsl.utils.lexer import Token


@dataclass
class Location:
    "Structure definition of a location in a file."

    file: Path
    file: str
    line: int
    col: int

    def __repr__(self):
        return f"{self.file}:{self.line}:{self.col}"


_NEWLINE = re.compile(r"\n")


def loc(token: Token[Any]) -> Location:
    file = token.span.input.name
    # Could be much faster

    remaining = token.span.start
    prev_end = 0

    for line, newline_match in enumerate(
        re.finditer(_NEWLINE, token.span.input.content)
    ):
        len_line = newline_match.start() - prev_end
        if remaining < len_line:
            return Location(file, line + 1, remaining + 1)
        remaining -= len_line + 1
        prev_end = newline_match.end()

    raise AssertionError(f"Could not find location of token {token}")
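
As a worked example of the scan above (a sketch assuming ToyLexer is imported
from the sibling lexer module, and noting that the loop only locates tokens on
lines terminated by a newline, hence the trailing "\n" in the input):

    from xdsl.utils.lexer import Input

    lexer = ToyLexer(Input("ab\ncd\n", "example.toy"))
    first = lexer.lex()   # identifier "ab", span starts at offset 0
    second = lexer.lex()  # identifier "cd", span starts at offset 3

    assert loc(first) == Location("example.toy", 1, 1)
    assert loc(second) == Location("example.toy", 2, 1)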