From c304f006bcce167ad311dbd3728b361cb765926b Mon Sep 17 00:00:00 2001
From: Tommy Yu
Date: Fri, 26 Jul 2019 15:22:10 -0700
Subject: [PATCH] Ensure errors in template literals also dealt with

---
Review notes (text in this section is ignored when the patch is applied):
- Line breaks and indentation were reconstructed; blank context lines and
  comment column alignment are best-effort, so whitespace tolerance may be
  needed when applying.
- broken_template_token_handler no longer leaks KeyError/IndexError/
  AttributeError for rejected escapes other than \x and \u, and the hex
  digit character class no longer accepts a literal '-'.
- The es2015.py blob id in the index line predates these handler changes.

 src/calmjs/parse/lexers/es2015.py           | 107 ++++++++++++++++++++
 src/calmjs/parse/tests/lexer.py             |  46 +++++++++
 src/calmjs/parse/tests/test_es2015_lexer.py |   9 +-
 3 files changed, 161 insertions(+), 1 deletion(-)

diff --git a/src/calmjs/parse/lexers/es2015.py b/src/calmjs/parse/lexers/es2015.py
index 84038a4..0c7a491 100644
--- a/src/calmjs/parse/lexers/es2015.py
+++ b/src/calmjs/parse/lexers/es2015.py
@@ -5,7 +5,10 @@
 
 import re
 import ply
+from itertools import chain
 
+from calmjs.parse.utils import repr_compat
+from calmjs.parse.exceptions import ECMASyntaxError
 from calmjs.parse.lexers.es5 import Lexer as ES5Lexer
 
 template_token_types = (
@@ -26,12 +29,95 @@
     'YIELD',
 )
 
+PATT_BROKEN_TEMPLATE = re.compile(r"""
+(?:(?:`|})                                # opening ` or }
+    (?: [^`\\]                            # not `, \; allow
+        | \\(\n|\r(?!\n)|\u2028|\u2029|\r\n)  # line continuation
+        | \\[a-tvwyzA-TVWYZ!-\/:-@\[-`{-~] # escaped chars
+        | \\x[0-9a-fA-F]{2}               # hex_escape_sequence
+        | \\u[0-9a-fA-F]{4}               # unicode_escape_sequence
+        | \\(?:[1-7][0-7]{0,2}|[0-7]{2,3})  # octal_escape_sequence
+        | \\0                             # (ECMA-262 6.0 21.2.2.11)
+    )*                                    # zero or many times
+)                                         # omit closing ` or ${
+""", flags=re.VERBOSE)
+
+
+def broken_template_token_handler(lexer, token):
+    """
+    Error token handler for malformed template literals.
+
+    If the value of the error token begins like a template (with a
+    ` or a }), raise an ECMASyntaxError that reports either the
+    escape sequence that stopped the match, or the unterminated
+    template literal.  Otherwise return without raising.
+    """
+
+    match = PATT_BROKEN_TEMPLATE.match(token.value)
+    if not match:
+        return
+
+    # update the error token value to only include what was matched here
+    # as this will be the actual token that "failed"
+    token.value = match.group()
+    # calculate colno for current token colno before...
+    colno = lexer._get_colno(token)
+    # updating the newline indexes for the error reporting for raw
+    # lexpos
+    lexer._update_newline_idx(token)
+    # probe for the next values (which no valid rules will match)
+    position = lexer.lexer.lexpos + len(token.value)
+    failure = lexer.lexer.lexdata[position:position + 2]
+    if failure and failure[0] == '\\':
+        # only \x and \u have a dedicated message; any other rejected
+        # escape (or a lone trailing backslash) must still result in an
+        # ECMASyntaxError rather than a KeyError/IndexError.
+        type_ = {'x': 'hexadecimal', 'u': 'unicode'}.get(failure[1:])
+        if type_:
+            seq = re.match(
+                r'\\[xu][0-9a-fA-F]*', lexer.lexer.lexdata[position:]
+            ).group()
+            raise ECMASyntaxError(
+                "Invalid %s escape sequence '%s' at %s:%s" % (
+                    type_, seq, lexer.lineno,
+                    lexer._get_colno_lexpos(position)
+                )
+            )
+        if failure[1:]:
+            raise ECMASyntaxError(
+                "Invalid escape sequence '%s' at %s:%s" % (
+                    failure, lexer.lineno,
+                    lexer._get_colno_lexpos(position)
+                )
+            )
+    tl = 16  # truncate length
+
+    if lexer.current_template_tokens:
+        # join all tokens together
+        tmpl = '...'.join(
+            t.value for t in chain(lexer.current_template_tokens, [token]))
+        lineno = lexer.current_template_tokens[0].lineno
+        colno = lexer.current_template_tokens[0].colno
+    else:
+        tmpl = token.value
+        lineno = token.lineno
+
+    raise ECMASyntaxError('Unterminated template literal %s at %s:%s' % (
+        repr_compat(tmpl[:tl].strip() + (tmpl[tl:] and '...')), lineno, colno))
+
 
 class Lexer(ES5Lexer):
     """
     ES2015 lexer.
     """
 
+    def __init__(self):
+        super(Lexer, self).__init__()
+        self.error_token_handlers.append(broken_template_token_handler)
+        # the TEMPLATE_HEAD/TEMPLATE_MIDDLE tokens of the template
+        # literal currently being tokenized, if any.
+        self.current_template_tokens = []
+
     # Punctuators (ES6)
     # t_DOLLAR_LBRACE = r'${'
     # this is also a right brace punctuator...
@@ -84,9 +170,30 @@ class Lexer(ES5Lexer):
     (?:`|\${))                                # closing ` or ${
     """
 
+    RBRACE = r'}'
+
     @ply.lex.TOKEN(template)
     def t_TEMPLATE_RAW(self, token):
         for patt, token_type in template_token_types:
             if patt.match(token.value):
                 token.type = token_type
+                break
+        # track the tokens of an unfinished template for error reporting
+        if token.type == 'TEMPLATE_HEAD':
+            self.current_template_tokens = [token]
+        elif token.type == 'TEMPLATE_MIDDLE':
+            self.current_template_tokens.append(token)
+        else:
+            self.current_template_tokens = []
+
+        return token
+
+    @ply.lex.TOKEN(RBRACE)
+    def t_RBRACE(self, token):
+        if self.current_template_tokens:
+            # a template is pending: hand the input from this } onwards
+            # to the handler, which raises as the value starts with }.
+            self.lexer.lexpos = self.lexer.lexpos - 1
+            token.value = self.lexer.lexdata[self.lexer.lexpos:]
+            broken_template_token_handler(self, token)
         return token
diff --git a/src/calmjs/parse/tests/lexer.py b/src/calmjs/parse/tests/lexer.py
index 6308841..4880ef8 100644
--- a/src/calmjs/parse/tests/lexer.py
+++ b/src/calmjs/parse/tests/lexer.py
@@ -632,6 +632,52 @@
     )
 ]
 
+# various template related syntax errors
+es2015_error_cases_tmpl = [
+    (
+        'unterminated_template_eof',
+        "var foo = `test",
+        "Unterminated template literal '`test' at 1:11",
+    ), (
+        'unterminated_template_middle_eof',
+        "var foo = `${foo}bar${baz}fail",
+        # the specific identifiers are not tracked, thus ...
+        "Unterminated template literal '`${...}bar${...}...' at 1:11",
+    ), (
+        'invalid_hex_sequence',
+        "var foo = `fail\\x1`",
+        # backticks are converted to single quotes
+        "Invalid hexadecimal escape sequence '\\x1' at 1:16",
+    ), (
+        'invalid_unicode_sequence',
+        "var foo = `fail\\u12`",
+        "Invalid unicode escape sequence '\\u12' at 1:16",
+    ), (
+        'invalid_hex_sequence_multiline',
+        "var foo = `foobar\r\nfail\\x1`",
+        # backticks are converted to single quotes
+        "Invalid hexadecimal escape sequence '\\x1' at 2:5",
+    ), (
+        'invalid_unicode_sequence_multiline',
+        "var foo = `foobar\nfail\\u12`",
+        "Invalid unicode escape sequence '\\u12' at 2:5",
+    ), (
+        'invalid_hex_sequence_middle',
+        "var foo = `fail${wat}blah\\x1`",
+        # backticks are converted to single quotes
+        "Invalid hexadecimal escape sequence '\\x1' at 1:26",
+    ), (
+        'invalid_hex_sequence_middle_multiline',
+        "var foo = `foobar${lolwat}\r\nfailure${failure}wat\r\nwat\\x1`",
+        # backticks are converted to single quotes
+        "Invalid hexadecimal escape sequence '\\x1' at 3:4",
+    ), (
+        'long_invalid_template_truncated',
+        "var foo = `1234567890abcdetruncated",
+        "Unterminated template literal '`1234567890abcde...' at 1:11",
+    )
+]
+
 
 def run_lexer(value, lexer_cls):
     lexer = lexer_cls()
diff --git a/src/calmjs/parse/tests/test_es2015_lexer.py b/src/calmjs/parse/tests/test_es2015_lexer.py
index 4f35757..fae02cc 100644
--- a/src/calmjs/parse/tests/test_es2015_lexer.py
+++ b/src/calmjs/parse/tests/test_es2015_lexer.py
@@ -6,6 +6,7 @@
 
 from calmjs.parse.exceptions import ECMASyntaxError
 from calmjs.parse.testing.util import build_equality_testcase
+from calmjs.parse.testing.util import build_exception_testcase
 from calmjs.parse.tests.lexer import (
     run_lexer,
     run_lexer_pos,
@@ -13,6 +14,7 @@
     es5_pos_cases,
     es2015_cases,
     es2015_pos_cases,
+    es2015_error_cases_tmpl,
 )
 
 
@@ -23,7 +25,8 @@ def test_initial_template_character(self):
         lexer.input('`')
         with self.assertRaises(ECMASyntaxError) as e:
             [token for token in lexer]
-        self.assertEqual(str(e.exception), "Illegal character '`' at 1:1")
+        self.assertEqual(
+            str(e.exception), "Unterminated template literal '`' at 1:1")
 
 
 LexerKeywordTestCase = build_equality_testcase(
@@ -54,3 +57,7 @@ def test_initial_template_character(self):
 LexerES2015PosTestCase = build_equality_testcase(
     'LexerES2015PosTestCase', partial(
         run_lexer_pos, lexer_cls=Lexer), es2015_pos_cases)
+
+LexerES2015ErrorTemplateTestCase = build_exception_testcase(
+    'LexerES2015ErrorTemplateTestCase', partial(
+        run_lexer, lexer_cls=Lexer), es2015_error_cases_tmpl, ECMASyntaxError)