Skip to content

Commit

Permalink
Tokenize comment contents
Browse files Browse the repository at this point in the history
This will allow context-dependent handling of special characters such as '%' in URLs
or parsing of verbatim environments without resorting to search/replace preprocessing.
  • Loading branch information
xemlock committed Apr 4, 2023
1 parent bbdf03e commit d53a5da
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 66 deletions.
38 changes: 3 additions & 35 deletions library/PhpLatex/Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ class PhpLatex_Lexer
const STATE_BSLASH = 1;
const STATE_CONTROL = 2;
const STATE_SPACE = 3;
const STATE_COMMENT = 4;

const TYPE_TEXT = 'text';
const TYPE_SPACE = 'space';
const TYPE_CWORD = 'cword';
const TYPE_CSYMBOL = 'csymbol';
const TYPE_SPECIAL = 'special';

/** @deprecated */
const TYPE_COMMENT = 'comment';

protected $_str;
Expand Down Expand Up @@ -96,30 +97,6 @@ public function next()
$buf = '';

do {
// special handling for comments - if we're in the comment state parse everything up to first newline
// no need to match it char by char
if ($this->_state === self::STATE_COMMENT) {
// at this point $this->_pos points to first char after '%' which started the comment.
// _line and _column still point to position of '%'
// The \G assertion is true only when the current matching position is at the start
// point of the match, as specified by the offset argument.
// https://www.php.net/manual/en/regexp.reference.escape.php
preg_match('#\G(?<comment>.*)#', $this->_str, $matches, 0, $this->_pos);

if (strlen($matches['comment'])) {
$this->_column++; // normally column would be incremented in _getChar()
$this->storeTokenPosition();

// adjust counters, so that call to _getChar()
$this->_pos += strlen($matches['comment']);
$this->_column += strlen($matches['comment']) - 1;

return $this->_setToken(self::TYPE_COMMENT, $matches['comment']);
} else {
$this->_state = self::STATE_DEFAULT;
}
}

$c = $this->_getChar();

switch ($c) {
Expand Down Expand Up @@ -246,9 +223,7 @@ public function next()

$this->storeTokenPosition();

$token = $this->_setToken(self::TYPE_SPECIAL, '%');
$this->_state = self::STATE_COMMENT;
return $token;
return $this->_setToken(self::TYPE_SPECIAL, '%');

case self::STATE_BSLASH:
return $this->_setToken(self::TYPE_CSYMBOL, '\\%');
Expand All @@ -261,13 +236,6 @@ public function next()
case self::STATE_SPACE:
$this->_ungetChar();
return $this->_setSpaceToken($buf);

case self::STATE_COMMENT:
if (!strlen($buf)) {
$this->storeTokenPosition();
}
$buf .= $c;
break;
}
break;

Expand Down
54 changes: 28 additions & 26 deletions library/PhpLatex/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ protected function _parseExpr($state, $environ = null) // {{{
return $this->_parseText($token, $state);

case PhpLatex_Lexer::TYPE_COMMENT:
$this->_skipSpaces();
$this->_skipSpacesAndComments();
break;

default:
Expand Down Expand Up @@ -513,7 +513,7 @@ protected function _parseControl($token, $mode, $environ = null) // {{{
// Skip all spaces and comments occurring after this token, if this
// token is a control word.
if ($token['type'] === PhpLatex_Lexer::TYPE_CWORD) {
$this->_skipSpaces();
$this->_skipSpacesAndComments();
}

$mathWrapper = null;
Expand Down Expand Up @@ -624,29 +624,32 @@ protected function _parseControl($token, $mode, $environ = null) // {{{
*
* After this function has run current token, if exists, is neither space
* nor comment.
*
* @return array skipped SPACE and COMMENT tokens
*/
protected function _skipSpaces()
protected function _skipSpacesAndComments($inComment = false)
{
$skipped = array();
while ($next = $this->_peek()) {
if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE ||
$next['type'] === PhpLatex_Lexer::TYPE_COMMENT ||
($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%')
) {
$skipped[] = $next;
$this->_next();
if ($inComment) {
if (isset($next['raw']) && strpos($next['raw'], "\n") !== false) {
$inComment = false;
} else {
$this->_next();
}
} else {
break;
if ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%') {
$inComment = true;
$this->_next();
} else if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE) {
$this->_next();
} else {
break;
}
}
}
return $skipped;
}

protected function _parseArg($mode, $environ, $parseArgs = true) // {{{
{
$this->_skipSpaces();
$this->_skipSpacesAndComments();

if ($next = $this->_peek()) {
switch ($next['type']) {
Expand Down Expand Up @@ -761,7 +764,7 @@ protected function _parseArg($mode, $environ, $parseArgs = true) // {{{
*/
protected function _parseOptArg($state, $environ) // {{{
{
$this->_skipSpaces();
$this->_skipSpacesAndComments();

if (($next = $this->_peek()) &&
($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL) &&
Expand Down Expand Up @@ -789,16 +792,11 @@ protected function _parseOptArg($state, $environ) // {{{
*/
protected function _parseEnvName() // {{{
{
while (false !== ($next = $this->_peek())) {
if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE ||
$next['type'] === PhpLatex_Lexer::TYPE_COMMENT ||
($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%')
) {
// 1. skip spaces and comments
$this->_next();
continue;
// 1. Skip spaces and comments
$this->_skipSpacesAndComments();

} elseif ($next['value'] !== '{') {
while (false !== ($next = $this->_peek())) {
if ($next['value'] !== '{') {
// 2A. first encountered non-space token is not a curly bracket
// Since start of group was expected, this token breaks opening
// of an environment. Give it back and report failure.
Expand Down Expand Up @@ -959,6 +957,10 @@ protected function _parseSpecial($token, $state, $environ) // {{{
$node->value = $value;
return $node;

case '%':
$this->_skipSpacesAndComments(true);
break;

case '#':
// currently not supported
break;
Expand All @@ -981,7 +983,7 @@ protected function _parseLeftRight($token, $mode, $environs)

$environs = (array) $environs;

$this->_skipSpaces();
$this->_skipSpacesAndComments();
$next = $this->_peek();
if (!$next) {
return false;
Expand Down
25 changes: 21 additions & 4 deletions tests/PhpLatex/Test/LexerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ public function testComment()
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'A', 'line' => 1, 'column' => 1),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 2),
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 1, 'column' => 3),
array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' comment', 'line' => 1, 'column' => 4),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 4),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'comment', 'line' => 1, 'column' => 5),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 1, 'column' => 12),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'B', 'line' => 2, 'column' => 1),
)
Expand Down Expand Up @@ -49,7 +50,8 @@ public function testCommentOnly()
}
$this->assertEquals(array(
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 1, 'column' => 1),
array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' A', 'line' => 1, 'column' => 2),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 2),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'A', 'line' => 1, 'column' => 3),
), $tokens);
}

Expand Down Expand Up @@ -96,12 +98,27 @@ public function testTokens()
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '}', 'line' => 5, 'column' => 15),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 16),
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 5, 'column' => 17),
array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' comment in math mode', 'line' => 5, 'column' => 18),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 18),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'comment', 'line' => 5, 'column' => 19),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 26),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'in', 'line' => 5, 'column' => 27),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 29),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'math', 'line' => 5, 'column' => 30),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 34),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'mode', 'line' => 5, 'column' => 35),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 5, 'column' => 39),
array('type' => PhpLatex_Lexer::TYPE_CSYMBOL, 'value' => '\]', 'line' => 6, 'column' => 1),
array('type' => PhpLatex_Lexer::TYPE_CWORD, 'value' => '\par', 'raw' => "\n\n", 'line' => 6, 'column' => 3),
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 8, 'column' => 1),
array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => '% Comment in text mode', 'line' => 8, 'column' => 2),
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 8, 'column' => 2),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 3),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'Comment', 'line' => 8, 'column' => 4),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 11),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'in', 'line' => 8, 'column' => 12),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 14),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'text', 'line' => 8, 'column' => 15),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 19),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'mode', 'line' => 8, 'column' => 20),
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 8, 'column' => 24),
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'End.', 'line' => 9, 'column' => 1),
)
Expand Down
2 changes: 1 addition & 1 deletion tests/PhpLatex/Test/Renderer/AbstractTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public function testIssue6()
\eta_{12} \\\\
\eta_{21} \\\\
\eta_2
\end{array}
\end{array} % comment is here
\]';

$parser = new PhpLatex_Parser();
Expand Down

0 comments on commit d53a5da

Please sign in to comment.