diff --git a/lazylex/html.py b/lazylex/html.py index 1107a0dfc..d41663bac 100755 --- a/lazylex/html.py +++ b/lazylex/html.py @@ -81,7 +81,7 @@ def Print(self, s): # HTML Tokens # CommentBegin and ProcessingBegin are "pseudo-tokens", not visible -TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData Invalid EndOfStream'.split( +TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData CDataStartTag CDataEndTag Invalid EndOfStream'.split( ) @@ -168,6 +168,7 @@ def MakeLexer(rules): # NOTE: < is allowed in these. (r']+ >', Tok.Decl), # + (r'<(?:script|style) [^>]+>', Tok.CDataStartTag), # start (r']+ >', Tok.EndTag), # self-closing
comes FIRST (r'< [^>]+ />', Tok.StartEndTag), # end
(r'< [^>]+ >', Tok.StartTag), # start @@ -175,7 +176,9 @@ def MakeLexer(rules): (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar), (r'& [a-zA-Z]+ ;', Tok.CharEntity), - # Note: > is allowed in raw data. + # HTML5 allows > in raw data - should we? It's apparently not allowed in + # XML. + # But < is not allowed. # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html (r'[^&<]+', Tok.RawData), (r'.', Tok.Invalid), # error! @@ -225,6 +228,15 @@ def _Peek(self): raise LexError(self.s, self.pos) return Tok.Processing, pos + 2 # ?> + # TODO: we need to enter state so the NEXT call can be CData + # And then the one after that must be CDataEndTag. + if tok_id == Tok.CDataStartTag: + end_tag = '' + pos = self.s.find(end_tag, self.pos) + if pos == -1: + # unterminated + raise LexError(self.s, self.pos) + return tok_id, m.end() else: raise AssertionError('Tok.Invalid rule should have matched') diff --git a/lazylex/html_test.py b/lazylex/html_test.py index ed67a1c10..f27a392cd 100755 --- a/lazylex/html_test.py +++ b/lazylex/html_test.py @@ -165,6 +165,39 @@ def testProcessingInstruction(self): log('tok %r', html.TokenName(tok_id)) self.assertEqual(Tok.EndOfStream, tok_id) + def testScriptStyle(self): + + Tok = html.Tok + h = ''' + hi + ''' + print(repr(h)) + lex = html.ValidTokens(h) + + tok_id, pos = next(lex) + self.assertEqual(12, pos) + self.assertEqual(Tok.RawData, tok_id) + + # + tok_id, pos = next(lex) + self.assertEqual(27, pos) + log('tok %r', html.TokenName(tok_id)) + self.assertEqual(Tok.CDataEndTag, tok_id) + def testValid(self): Tok = html.Tok @@ -205,38 +238,25 @@ def testValid(self): def testInvalid(self): Tok = html.Tok - lex = html.ValidTokens('&') - - tok_id, pos = next(lex) - self.assertEqual(3, pos) - self.assertEqual(Tok.StartTag, tok_id) - - try: - tok_id, pos = next(lex) - except html.LexError as e: - print(e) - else: - self.fail('Expected LexError') - - # Comment - lex = html.ValidTokens('