From d53a5da0bcf7f876477681410e5d40f0238a6940 Mon Sep 17 00:00:00 2001 From: xemlock <952555+xemlock@users.noreply.github.com> Date: Tue, 4 Apr 2023 22:50:41 +0200 Subject: [PATCH] Tokenize comment contents This will allow context-dependent handling of special characters such as '%' in urls or parsing verbatim environments without resorting to search/replace preprocessing. --- library/PhpLatex/Lexer.php | 38 ++----------- library/PhpLatex/Parser.php | 54 ++++++++++--------- tests/PhpLatex/Test/LexerTest.php | 25 +++++++-- tests/PhpLatex/Test/Renderer/AbstractTest.php | 2 +- 4 files changed, 53 insertions(+), 66 deletions(-) diff --git a/library/PhpLatex/Lexer.php b/library/PhpLatex/Lexer.php index fb8e9a2..6276e67 100644 --- a/library/PhpLatex/Lexer.php +++ b/library/PhpLatex/Lexer.php @@ -8,13 +8,14 @@ class PhpLatex_Lexer const STATE_BSLASH = 1; const STATE_CONTROL = 2; const STATE_SPACE = 3; - const STATE_COMMENT = 4; const TYPE_TEXT = 'text'; const TYPE_SPACE = 'space'; const TYPE_CWORD = 'cword'; const TYPE_CSYMBOL = 'csymbol'; const TYPE_SPECIAL = 'special'; + + /** @deprecated */ const TYPE_COMMENT = 'comment'; protected $_str; @@ -96,30 +97,6 @@ public function next() $buf = ''; do { - // special handling for comments - if we're in the comment state parse everything up to first newline - // no need to match it char by char - if ($this->_state === self::STATE_COMMENT) { - // at this point $this->_pos points to first char after '%' which started the comment. - // _line and _column still point to position of '%' - // The \G assertion is true only when the current matching position is at the start - // point of the match, as specified by the offset argument. 
- // https://www.php.net/manual/en/regexp.reference.escape.php - preg_match('#\G(?<comment>.*)#', $this->_str, $matches, 0, $this->_pos); - - if (strlen($matches['comment'])) { - $this->_column++; // normally column would be incremented in _getChar() - $this->storeTokenPosition(); - - // adjust counters, so that call to _getChar() - $this->_pos += strlen($matches['comment']); - $this->_column += strlen($matches['comment']) - 1; - - return $this->_setToken(self::TYPE_COMMENT, $matches['comment']); - } else { - $this->_state = self::STATE_DEFAULT; - } - } - $c = $this->_getChar(); switch ($c) { @@ -246,9 +223,7 @@ public function next() $this->storeTokenPosition(); - $token = $this->_setToken(self::TYPE_SPECIAL, '%'); - $this->_state = self::STATE_COMMENT; - return $token; + return $this->_setToken(self::TYPE_SPECIAL, '%'); case self::STATE_BSLASH: return $this->_setToken(self::TYPE_CSYMBOL, '\\%'); @@ -261,13 +236,6 @@ public function next() case self::STATE_SPACE: $this->_ungetChar(); return $this->_setSpaceToken($buf); - - case self::STATE_COMMENT: - if (!strlen($buf)) { - $this->storeTokenPosition(); - } - $buf .= $c; - break; } break; diff --git a/library/PhpLatex/Parser.php b/library/PhpLatex/Parser.php index fa888cb..db033f2 100644 --- a/library/PhpLatex/Parser.php +++ b/library/PhpLatex/Parser.php @@ -276,7 +276,7 @@ protected function _parseExpr($state, $environ = null) // {{{ return $this->_parseText($token, $state); case PhpLatex_Lexer::TYPE_COMMENT: - $this->_skipSpaces(); + $this->_skipSpacesAndComments(); break; default: @@ -513,7 +513,7 @@ protected function _parseControl($token, $mode, $environ = null) // {{{ // Skip all spaces and comments occurring after this token, if this // token is a control word. 
if ($token['type'] === PhpLatex_Lexer::TYPE_CWORD) { - $this->_skipSpaces(); + $this->_skipSpacesAndComments(); } $mathWrapper = null; @@ -624,29 +624,32 @@ protected function _parseControl($token, $mode, $environ = null) // {{{ * * After this function has run current token, if exists, is neither space * nor comment. - * - * @return array skipped SPACE and COMMENT tokens */ - protected function _skipSpaces() + protected function _skipSpacesAndComments($inComment = false) { - $skipped = array(); while ($next = $this->_peek()) { - if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE || - $next['type'] === PhpLatex_Lexer::TYPE_COMMENT || - ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%') - ) { - $skipped[] = $next; - $this->_next(); + if ($inComment) { + if (isset($next['raw']) && strpos($next['raw'], "\n") !== false) { + $inComment = false; + } else { + $this->_next(); + } } else { - break; + if ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%') { + $inComment = true; + $this->_next(); + } else if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE) { + $this->_next(); + } else { + break; + } } } - return $skipped; } protected function _parseArg($mode, $environ, $parseArgs = true) // {{{ { - $this->_skipSpaces(); + $this->_skipSpacesAndComments(); if ($next = $this->_peek()) { switch ($next['type']) { @@ -761,7 +764,7 @@ protected function _parseArg($mode, $environ, $parseArgs = true) // {{{ */ protected function _parseOptArg($state, $environ) // {{{ { - $this->_skipSpaces(); + $this->_skipSpacesAndComments(); if (($next = $this->_peek()) && ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL) && @@ -789,16 +792,11 @@ protected function _parseOptArg($state, $environ) // {{{ */ protected function _parseEnvName() // {{{ { - while (false !== ($next = $this->_peek())) { - if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE || - $next['type'] === PhpLatex_Lexer::TYPE_COMMENT || - ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && 
$next['value'] === '%') - ) { - // 1. skip spaces and comments - $this->_next(); - continue; + // 1. Skip spaces and comments + $this->_skipSpacesAndComments(); - } elseif ($next['value'] !== '{') { + while (false !== ($next = $this->_peek())) { + if ($next['value'] !== '{') { // 2A. first encountered non-space token is not a curly bracket // Since start of group was expected, this token breaks opening // of an environment. Give it back and report failure. @@ -959,6 +957,10 @@ protected function _parseSpecial($token, $state, $environ) // {{{ $node->value = $value; return $node; + case '%': + $this->_skipSpacesAndComments(true); + break; + case '#': // currently not supported break; @@ -981,7 +983,7 @@ protected function _parseLeftRight($token, $mode, $environs) $environs = (array) $environs; - $this->_skipSpaces(); + $this->_skipSpacesAndComments(); $next = $this->_peek(); if (!$next) { return false; diff --git a/tests/PhpLatex/Test/LexerTest.php b/tests/PhpLatex/Test/LexerTest.php index bc7b12d..fe0bcdb 100644 --- a/tests/PhpLatex/Test/LexerTest.php +++ b/tests/PhpLatex/Test/LexerTest.php @@ -10,7 +10,8 @@ public function testComment() array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'A', 'line' => 1, 'column' => 1), array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 2), array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 1, 'column' => 3), - array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' comment', 'line' => 1, 'column' => 4), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 4), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'comment', 'line' => 1, 'column' => 5), array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 1, 'column' => 12), array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'B', 'line' => 2, 'column' => 1), ) @@ -49,7 +50,8 @@ public function testCommentOnly() } 
$this->assertEquals(array( array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 1, 'column' => 1), - array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' A', 'line' => 1, 'column' => 2), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 2), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'A', 'line' => 1, 'column' => 3), ), $tokens); } @@ -96,12 +98,27 @@ public function testTokens() array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '}', 'line' => 5, 'column' => 15), array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 16), array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 5, 'column' => 17), - array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' comment in math mode', 'line' => 5, 'column' => 18), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 18), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'comment', 'line' => 5, 'column' => 19), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 26), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'in', 'line' => 5, 'column' => 27), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 29), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'math', 'line' => 5, 'column' => 30), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 34), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'mode', 'line' => 5, 'column' => 35), array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 5, 'column' => 39), array('type' => PhpLatex_Lexer::TYPE_CSYMBOL, 'value' => '\]', 'line' => 6, 'column' => 1), array('type' => PhpLatex_Lexer::TYPE_CWORD, 'value' => '\par', 'raw' => "\n\n", 'line' => 6, 'column' => 3), array('type' => 
PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 8, 'column' => 1), - array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => '% Comment in text mode', 'line' => 8, 'column' => 2), + array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 8, 'column' => 2), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 3), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'Comment', 'line' => 8, 'column' => 4), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 11), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'in', 'line' => 8, 'column' => 12), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 14), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'text', 'line' => 8, 'column' => 15), + array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 19), + array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'mode', 'line' => 8, 'column' => 20), array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 8, 'column' => 24), array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'End.', 'line' => 9, 'column' => 1), ) diff --git a/tests/PhpLatex/Test/Renderer/AbstractTest.php b/tests/PhpLatex/Test/Renderer/AbstractTest.php index 853db78..a0f68dc 100644 --- a/tests/PhpLatex/Test/Renderer/AbstractTest.php +++ b/tests/PhpLatex/Test/Renderer/AbstractTest.php @@ -41,7 +41,7 @@ public function testIssue6() \eta_{12} \\\\ \eta_{21} \\\\ \eta_2 - \end{array} + \end{array} % comment is here \]'; $parser = new PhpLatex_Parser();