From d53a5da0bcf7f876477681410e5d40f0238a6940 Mon Sep 17 00:00:00 2001
From: xemlock <952555+xemlock@users.noreply.github.com>
Date: Tue, 4 Apr 2023 22:50:41 +0200
Subject: [PATCH] Tokenize comment contents

This will allow context-dependent handling of special characters, such as '%' in URLs,
or parsing of verbatim environments, without resorting to search/replace preprocessing.
---
 library/PhpLatex/Lexer.php                    | 38 ++-----------
 library/PhpLatex/Parser.php                   | 54 ++++++++++---------
 tests/PhpLatex/Test/LexerTest.php             | 25 +++++++--
 tests/PhpLatex/Test/Renderer/AbstractTest.php |  2 +-
 4 files changed, 53 insertions(+), 66 deletions(-)
diff --git a/library/PhpLatex/Lexer.php b/library/PhpLatex/Lexer.php
index fb8e9a2..6276e67 100644
--- a/library/PhpLatex/Lexer.php
+++ b/library/PhpLatex/Lexer.php
@@ -8,13 +8,14 @@ class PhpLatex_Lexer
     const STATE_BSLASH  = 1;
     const STATE_CONTROL = 2;
     const STATE_SPACE   = 3;
-    const STATE_COMMENT = 4;
 
     const TYPE_TEXT     = 'text';
     const TYPE_SPACE    = 'space';
     const TYPE_CWORD    = 'cword';
     const TYPE_CSYMBOL  = 'csymbol';
     const TYPE_SPECIAL  = 'special';
+
+    /** @deprecated The lexer no longer emits comment tokens; text following '%' is tokenized like any other input. */
     const TYPE_COMMENT  = 'comment';
 
     protected $_str;
@@ -96,30 +97,6 @@ public function next()
         $buf = '';
 
         do {
-            // special handling for comments - if we're in the comment state parse everything up to first newline
-            // no need to match it char by char
-            if ($this->_state === self::STATE_COMMENT) {
-                // at this point $this->_pos points to first char after '%' which started the comment.
-                // _line and _column still point to position of '%'
-                // The \G assertion is true only when the current matching position is at the start
-                // point of the match, as specified by the offset argument.
-                // https://www.php.net/manual/en/regexp.reference.escape.php
-                preg_match('#\G(?<comment>.*)#', $this->_str, $matches, 0, $this->_pos);
-
-                if (strlen($matches['comment'])) {
-                    $this->_column++; // normally column would be incremented in _getChar()
-                    $this->storeTokenPosition();
-
-                    // adjust counters, so that call to _getChar()
-                    $this->_pos += strlen($matches['comment']);
-                    $this->_column += strlen($matches['comment']) - 1;
-
-                    return $this->_setToken(self::TYPE_COMMENT, $matches['comment']);
-                } else {
-                    $this->_state = self::STATE_DEFAULT;
-                }
-            }
-
             $c = $this->_getChar();
 
             switch ($c) {
@@ -246,9 +223,7 @@ public function next()
 
                             $this->storeTokenPosition();
 
-                            $token = $this->_setToken(self::TYPE_SPECIAL, '%');
-                            $this->_state = self::STATE_COMMENT;
-                            return $token;
+                            return $this->_setToken(self::TYPE_SPECIAL, '%');
 
                         case self::STATE_BSLASH:
                             return $this->_setToken(self::TYPE_CSYMBOL, '\\%');
@@ -261,13 +236,6 @@ public function next()
                         case self::STATE_SPACE:
                             $this->_ungetChar();
                             return $this->_setSpaceToken($buf);
-
-                        case self::STATE_COMMENT:
-                            if (!strlen($buf)) {
-                                $this->storeTokenPosition();
-                            }
-                            $buf .= $c;
-                            break;
                     }
                     break;
 
diff --git a/library/PhpLatex/Parser.php b/library/PhpLatex/Parser.php
index fa888cb..db033f2 100644
--- a/library/PhpLatex/Parser.php
+++ b/library/PhpLatex/Parser.php
@@ -276,7 +276,7 @@ protected function _parseExpr($state, $environ = null) // {{{
                     return $this->_parseText($token, $state);
 
                 case PhpLatex_Lexer::TYPE_COMMENT:
-                    $this->_skipSpaces();
+                    $this->_skipSpacesAndComments();
                     break;
 
                 default:
@@ -513,7 +513,7 @@ protected function _parseControl($token, $mode, $environ = null) // {{{
         // Skip all spaces and comments occurring after this token, if this
         // token is a control word.
         if ($token['type'] === PhpLatex_Lexer::TYPE_CWORD) {
-            $this->_skipSpaces();
+            $this->_skipSpacesAndComments();
         }
 
         $mathWrapper = null;
@@ -624,29 +624,32 @@ protected function _parseControl($token, $mode, $environ = null) // {{{
      *
      * After this function has run current token, if exists, is neither space
      * nor comment.
-     *
-     * @return array skipped SPACE and COMMENT tokens
      */
-    protected function _skipSpaces()
+    protected function _skipSpacesAndComments($inComment = false)
     {
-        $skipped = array();
         while ($next = $this->_peek()) {
-            if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE ||
-                $next['type'] === PhpLatex_Lexer::TYPE_COMMENT ||
-                ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%')
-            ) {
-                $skipped[] = $next;
-                $this->_next();
+            if ($inComment) {
+                if (isset($next['raw']) && strpos($next['raw'], "\n") !== false) {
+                    $inComment = false;
+                } else {
+                    $this->_next();
+                }
             } else {
-                break;
+                if ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%') {
+                    $inComment = true;
+                    $this->_next();
+                } else if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE) {
+                    $this->_next();
+                } else {
+                    break;
+                }
             }
         }
-        return $skipped;
     }
 
     protected function _parseArg($mode, $environ, $parseArgs = true) // {{{
     {
-        $this->_skipSpaces();
+        $this->_skipSpacesAndComments();
 
         if ($next = $this->_peek()) {
             switch ($next['type']) {
@@ -761,7 +764,7 @@ protected function _parseArg($mode, $environ, $parseArgs = true) // {{{
      */
     protected function _parseOptArg($state, $environ) // {{{
     {
-        $this->_skipSpaces();
+        $this->_skipSpacesAndComments();
 
         if (($next = $this->_peek()) &&
             ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL) &&
@@ -789,16 +792,11 @@ protected function _parseOptArg($state, $environ) // {{{
      */
     protected function _parseEnvName() // {{{
     {
-        while (false !== ($next = $this->_peek())) {
-            if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE ||
-                $next['type'] === PhpLatex_Lexer::TYPE_COMMENT ||
-                ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%')
-            ) {
-                // 1. skip spaces and comments
-                $this->_next();
-                continue;
+        // 1. Skip spaces and comments
+        $this->_skipSpacesAndComments();
 
-            } elseif ($next['value'] !== '{') {
+        while (false !== ($next = $this->_peek())) {
+            if ($next['value'] !== '{') {
                 // 2A. first encountered non-space token is not a curly bracket
                 // Since start of group was expected, this token breaks opening
                 // of an environment. Give it back and report failure.
@@ -959,6 +957,10 @@ protected function _parseSpecial($token, $state, $environ) // {{{
                 $node->value = $value;
                 return $node;
 
+            case '%':
+                $this->_skipSpacesAndComments(true);
+                break;
+
             case '#':
                 // currently not supported
                 break;
@@ -981,7 +983,7 @@ protected function _parseLeftRight($token, $mode, $environs)
 
         $environs = (array) $environs;
 
-        $this->_skipSpaces();
+        $this->_skipSpacesAndComments();
         $next = $this->_peek();
         if (!$next) {
             return false;
diff --git a/tests/PhpLatex/Test/LexerTest.php b/tests/PhpLatex/Test/LexerTest.php
index bc7b12d..fe0bcdb 100644
--- a/tests/PhpLatex/Test/LexerTest.php
+++ b/tests/PhpLatex/Test/LexerTest.php
@@ -10,7 +10,8 @@ public function testComment()
                 array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'A', 'line' => 1, 'column' => 1),
                 array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 2),
                 array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 1, 'column' => 3),
-                array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' comment', 'line' => 1, 'column' => 4),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 4),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'comment', 'line' => 1, 'column' => 5),
                 array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 1, 'column' => 12),
                 array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'B', 'line' => 2, 'column' => 1),
             )
@@ -49,7 +50,8 @@ public function testCommentOnly()
         }
         $this->assertEquals(array(
             array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 1, 'column' => 1),
-            array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' A', 'line' => 1, 'column' => 2),
+            array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 2),
+            array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'A', 'line' => 1, 'column' => 3),
         ), $tokens);
     }
 
@@ -96,12 +98,27 @@ public function testTokens()
                 array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '}', 'line' => 5, 'column' => 15),
                 array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 16),
                 array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 5, 'column' => 17),
-                array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' comment in math mode', 'line' => 5, 'column' => 18),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 18),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'comment', 'line' => 5, 'column' => 19),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 26),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'in', 'line' => 5, 'column' => 27),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 29),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'math', 'line' => 5, 'column' => 30),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 34),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'mode', 'line' => 5, 'column' => 35),
                 array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 5, 'column' => 39),
                 array('type' => PhpLatex_Lexer::TYPE_CSYMBOL, 'value' => '\]', 'line' => 6, 'column' => 1),
                 array('type' => PhpLatex_Lexer::TYPE_CWORD, 'value' => '\par', 'raw' => "\n\n", 'line' => 6, 'column' => 3),
                 array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 8, 'column' => 1),
-                array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => '% Comment in text mode', 'line' => 8, 'column' => 2),
+                array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 8, 'column' => 2),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 3),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'Comment', 'line' => 8, 'column' => 4),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 11),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'in', 'line' => 8, 'column' => 12),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 14),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'text', 'line' => 8, 'column' => 15),
+                array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 19),
+                array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'mode', 'line' => 8, 'column' => 20),
                 array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 8, 'column' => 24),
                 array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'End.', 'line' => 9, 'column' => 1),
             )
diff --git a/tests/PhpLatex/Test/Renderer/AbstractTest.php b/tests/PhpLatex/Test/Renderer/AbstractTest.php
index 853db78..a0f68dc 100644
--- a/tests/PhpLatex/Test/Renderer/AbstractTest.php
+++ b/tests/PhpLatex/Test/Renderer/AbstractTest.php
@@ -41,7 +41,7 @@ public function testIssue6()
         \eta_{12} \\\\
         \eta_{21} \\\\
         \eta_2
-    \end{array}
+    \end{array} % comment is here
 \]';
 
         $parser = new PhpLatex_Parser();