diff --git a/lazylex/html.py b/lazylex/html.py
index 1107a0dfc..d41663bac 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -81,7 +81,7 @@ def Print(self, s):
# HTML Tokens
# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
-TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData Invalid EndOfStream'.split(
+TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData CDataStartTag CDataEndTag Invalid EndOfStream'.split(
)
@@ -168,6 +168,7 @@ def MakeLexer(rules):
# NOTE: < is allowed in these.
(r'<! [^>]+ >', Tok.Decl), # <!DOCTYPE html>
+ (r'<(?:script|style) [^>]+>', Tok.CDataStartTag), # start <script>
(r'</ [^>]+ >', Tok.EndTag), # self-closing <br/> comes FIRST
(r'< [^>]+ />', Tok.StartEndTag), # end </a>
(r'< [^>]+ >', Tok.StartTag), # start <a>
@@ -175,7 +176,9 @@ def MakeLexer(rules):
(r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
(r'& [a-zA-Z]+ ;', Tok.CharEntity),
- # Note: > is allowed in raw data.
+ # HTML5 allows > in raw data - should we? It's apparently not allowed in
+ # XML.
+ # But < is not allowed.
# https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
(r'[^&<]+', Tok.RawData),
(r'.', Tok.Invalid), # error!
@@ -225,6 +228,15 @@ def _Peek(self):
raise LexError(self.s, self.pos)
return Tok.Processing, pos + 2 # ?>
+ # TODO: we need to enter state so the NEXT call can be CData
+ # And then the one after that must be CDataEndTag.
+ if tok_id == Tok.CDataStartTag:
+ end_tag = '</script>'
+ pos = self.s.find(end_tag, self.pos)
+ if pos == -1:
+ # unterminated <script>
+ raise LexError(self.s, self.pos)
+
return tok_id, m.end()
else:
raise AssertionError('Tok.Invalid rule should have matched')
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index ed67a1c10..f27a392cd 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -165,6 +165,39 @@ def testProcessingInstruction(self):
log('tok %r', html.TokenName(tok_id))
self.assertEqual(Tok.EndOfStream, tok_id)
+ def testScriptStyle(self):
+
+ Tok = html.Tok
+ h = '''
+ hi <script src=""></script>
+ '''
+ print(repr(h))
+ lex = html.ValidTokens(h)
+
+ tok_id, pos = next(lex)
+ self.assertEqual(12, pos)
+ self.assertEqual(Tok.RawData, tok_id)
+
+ # <script src="">
+ tok_id, pos = next(lex)
+ self.assertEqual(27, pos)
+ log('tok %r', html.TokenName(tok_id))
+ self.assertEqual(Tok.CDataEndTag, tok_id)
+
def testValid(self):
Tok = html.Tok
@@ -205,38 +238,25 @@ def testValid(self):
def testInvalid(self):
Tok = html.Tok
- lex = html.ValidTokens('<a>&')
-
- tok_id, pos = next(lex)
- self.assertEqual(3, pos)
- self.assertEqual(Tok.StartTag, tok_id)
-
- try:
- tok_id, pos = next(lex)
- except html.LexError as e:
- print(e)
- else:
- self.fail('Expected LexError')
-
- # Comment
- lex = html.ValidTokens('