lexer.py
import re

from rply import LexerGenerator

lg = LexerGenerator()
# Build up the set of token names and the regexes they match; rules are tried
# in the order they are added, so keywords must come before IDENTIFIER.
lg.add('FLOAT', r'-?\d+\.\d+')
lg.add('INTEGER', r'-?\d+')
lg.add('STRING', r'(""".*?""")|(".*?")|(\'.*?\')')
# lg.add('PRINT', r'print(?!\w)')  # if re-enabled, keep this before IDENTIFIER, which would otherwise match it
# Keywords: each rule ends in a (?!\w) lookahead so that e.g. "iffy" lexes as
# an identifier rather than IF followed by "fy".
lg.add('BOOLEAN', r'true(?!\w)|false(?!\w)')
lg.add('IF', r'if(?!\w)')
lg.add('ELSE', r'else(?!\w)')
lg.add('END', r'end(?!\w)')
lg.add('AND', r'and(?!\w)')
lg.add('OR', r'or(?!\w)')
lg.add('NOT', r'not(?!\w)')
lg.add('LET', r'let(?!\w)')
lg.add('FOR', r'for(?!\w)')
lg.add('WHILE', r'while(?!\w)')
lg.add('BREAK', r'break(?!\w)')
lg.add('CONTINUE', r'continue(?!\w)')
lg.add('MATCH', r'match(?!\w)')
lg.add('ENUM', r'enum(?!\w)')
lg.add('NEW', r'new(?!\w)')
lg.add('RETURN', r'return(?!\w)')
lg.add('TYPE', r'type(?!\w)')
lg.add('TYPE_ARRAY', r'array(?!\w)')
lg.add('TYPE_DICT', r'dict(?!\w)')
lg.add('TYPE_INTEGER', r'int(?!\w)')
lg.add('TYPE_STRING', r'str(?!\w)')
lg.add('TYPE_FLOAT', r'float(?!\w)')
lg.add('TYPE_CHAR', r'char(?!\w)')
lg.add('TYPE_LONG', r'long(?!\w)')
lg.add('TYPE_DOUBLE', r'double(?!\w)')
lg.add('RECORD', r'record(?!\w)')
lg.add('FUNCTION', r'func(?!\w)')
lg.add('LAMBDA', r'fn(?!\w)')
lg.add('PRIVATE', r'priv(?!\w)')
lg.add('MODULE', r'mod(?!\w)')
lg.add('TRAIT', r'trait(?!\w)')
lg.add('IMPLEMENT', r'impl(?!\w)')
lg.add('IMPORT', r'import(?!\w)')
lg.add('SEND', r'send(?!\w)')
lg.add('RECEIVE', r'receive(?!\w)')
# IDENTIFIER comes after the keywords so it cannot swallow them.
lg.add('IDENTIFIER', r'[a-zA-Z_][a-zA-Z0-9_]*')
# Operators and punctuation; multi-character operators are registered before
# their single-character prefixes (e.g. >= before >).
lg.add('PLUS', r'\+')
lg.add('==', r'==')
lg.add('!=', r'!=')
lg.add('>=', r'>=')
lg.add('<=', r'<=')
lg.add('>', r'>')
lg.add('<', r'<')
lg.add('=', r'=')
lg.add('[', r'\[')
lg.add(']', r'\]')
lg.add('{', r'\{')
lg.add('}', r'\}')
lg.add('|', r'\|')
lg.add(',', r',')
lg.add('DOT', r'\.')
lg.add('COLON', r':')
lg.add('MINUS', r'-')
lg.add('MUL', r'\*')
lg.add('DIV', r'/')
lg.add('MOD', r'%')
lg.add('(', r'\(')
lg.add(')', r'\)')
lg.add("NEWLINE", r"\n+ *\n*")
# Ignore horizontal whitespace; newlines are significant and matched by NEWLINE above.
lg.ignore(r'[ \t\r\f\v]+')
lexer = lg.build()
def lex(source):
    """Strip comments and blank-line whitespace from `source`, then tokenize it."""
    comments = r'(#.*)(?:\n|\Z)'
    multiline = r'([\s]+)(?:\n)'
    # Remove comments: delete each "#..." span until none remain.
    comment = re.search(comments, source)
    while comment is not None:
        start, end = comment.span(1)
        assert start >= 0 and end >= 0
        source = source[0:start] + source[end:]  # remove the part of the string that was a comment
        comment = re.search(comments, source)
    # Remove whitespace runs that precede a newline, collapsing blank lines.
    line = re.search(multiline, source)
    while line is not None:
        start, end = line.span(1)
        assert start >= 0 and end >= 0
        source = source[0:start] + source[end:]  # remove the part of the string that was an empty line
        line = re.search(multiline, source)
    # print("source is now: %s" % source)
    return lexer.lex(source)
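
# A minimal usage sketch, added for illustration and not part of the original
# module: it feeds a small, hypothetical program through lex() and prints each
# token's type and text via rply's Token accessors.
if __name__ == '__main__':
    sample = 'let x = 42\nif x >= 10\n    x = x + 1\nend\n'
    for token in lex(sample):
        print(token.gettokentype(), repr(token.getstr()))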