Elephant418 · SL-Gundam · Feb 3, 2018 · Feb 3, 2018 · Feb 4, 2018 · Feb 4, 2018
diff --git a/src/Converter.php b/src/Converter.php
@@ -133,6 +133,7 @@ class Converter
     protected $ignore = [
         'html',
         'body',
+        'o:p',
     ];
 
     /**
@@ -183,15 +184,15 @@ class Converter
      * TODO: what's with block chars / sequences at the beginning of a block?
      */
     protected $escapeInText = [
-        '\*\*([^*]+)\*\*' => '\*\*$1\*\*', // strong
-        '\*([^*]+)\*' => '\*$1\*', // em
-        '__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_', // strong
-        '_(?! |_)(.+)(?!<_| )_' => '\_$1\_', // em
+        '\*' => '\\\\*', // *
+        '_' => '\\\\_', // _
+        '\|' => '\\\\|', // |
         '([-*_])([ ]{0,2}\1){2,}' => '\\\\$0', // hr
-        '`' => '\`', // code
-        '\[(.+)\](\s*\()' => '\[$1\]$2', // links: [text] (url) => [text\] (url)
-        '\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]', // links: [text][id] => [text\][id\]
-        '^#(#{0,5}) ' => '\#$1 ', // header
+        '`' => '\\\\`', // code
+        '\[(.+)\](\s*\()' => '\\\\[$1\\\\]$2', // links: [text] (url) => [text\] (url)
+        '\[(.+)\](\s*)\[(.*)\]' => '\\\\[$1\\\\]$2\\\\[$3\\\\]', // links: [text][id] => [text\][id\]
+        '^#(#{0,5}) ' => '\\\\#$1 ', // header #
+        '^=(=*\h*)$' => '\\\\=$1', // header =
     ];
 
     /**
@@ -227,6 +228,13 @@ class Converter
      */
     protected $indent = '';
 
+    /**
+     * previous indentation, when we want to disable current indentation and get it back later
+     *
+     * @var string
+     */
+    static $previousIndent = '';
+
     /**
      * constructor, set options, setup parser
      *
@@ -255,7 +263,7 @@ public function __construct($linkPosition = self::LINK_AFTER_CONTENT, $bodyWidth
         $search = [];
         $replace = [];
         foreach ($this->escapeInText as $s => $r) {
-            array_push($search, '@(?<!\\\)' . $s . '@U');
+            array_push($search, '@(?<!\\\)' . $s . '@mU');
             array_push($replace, $r);
         }
         $this->escapeInText = [
@@ -274,6 +282,7 @@ public function parseString($html)
     {
         $this->resetState();
 
+        $html = str_replace(array("\r\n", "\r"), "\n", $html);
         $this->parser->html = $html;
         $this->parse();
 
@@ -302,6 +311,16 @@ public function setKeepHTML($keepHTML)
         $this->keepHTML = $keepHTML;
     }
 
+    /**
+     * return escapeInText
+     *
+     * @return array escapeInText
+     */
+    public function getescapeInText()
+    {
+        return $this->escapeInText;
+    }
+
     /**
      * iterate through the nodes and decide what we
      * shall do with the current node
@@ -329,6 +348,7 @@ protected function parse()
                     // else drop
                     break;
                 case 'text':
+                    $this->flushLinebreaks();
                     $this->handleText();
                     break;
                 case 'tag':
@@ -395,7 +415,8 @@ protected function parse()
             }
         }
         // cleanup
-        $this->output = rtrim(str_replace('&amp;', '&', str_replace('&lt;', '<', str_replace('&gt;', '>', $this->output))));
+        $this->output = implode("\n", array_map('rtrim', explode("\n", $this->output)));
+        $this->output = rtrim(str_replace(['&amp;', '&lt;', '&gt;', '&nbsp;'], ['&', '<', '>', ' '], $this->output));
         // end parsing, flush stacked tags
         $this->flushFootnotes();
         $this->stack = [];
@@ -507,7 +528,7 @@ protected function handleTagToText()
     {
         if (!$this->keepHTML) {
             if (!$this->parser->isStartTag && $this->parser->isBlockElement) {
-                $this->setLineBreaks(2);
+                $this->setLineBreaks(1);
             }
         } else {
             // don't convert to markdown inside this tag
@@ -534,8 +555,7 @@ protected function handleTagToText()
                     // don't indent inside <pre> tags
                     if ($this->parser->tagName == 'pre') {
                         $this->out($this->parser->node);
-                        static $indent;
-                        $indent = $this->indent;
+                        $this->previousIndent = $this->indent;
                         $this->indent = '';
                     } else {
                         $this->out($this->parser->node . "\n" . $this->indent);
@@ -556,8 +576,7 @@ protected function handleTagToText()
                     } else {
                         // reset indentation
                         $this->out($this->parser->node);
-                        static $indent;
-                        $this->indent = $indent;
+                        $this->indent = $this->previousIndent;
                     }
 
                     if (in_array($this->parent(), ['ins', 'del'])) {
@@ -579,7 +598,7 @@ protected function handleTagToText()
                     $this->buffer();
                 } else {
                     // add stuff so cleanup just reverses this
-                    $this->out(str_replace('&lt;', '&amp;lt;', str_replace('&gt;', '&amp;gt;', $this->unbuffer())));
+                    $this->out(str_replace(['&lt;', '&gt;'], ['&amp;lt;', '&amp;gt;'], $this->unbuffer()));
                 }
             }
         }
@@ -733,6 +752,7 @@ protected function handleTag_p()
     {
         if (!$this->parser->isStartTag) {
             $this->setLineBreaks(2);
+            $this->parser->html = ltrim($this->parser->html);
         }
     }
 
@@ -786,7 +806,7 @@ protected function handleTag_a_converter($tag, $buffer)
             return '[' . $buffer . ']()';
         }
 
-        if ($buffer == $tag['href'] && empty($tag['title'])) {
+        if (rtrim($buffer, '/') == rtrim($tag['href'], '/') && empty($tag['title'])) {
             // <http://example.com>
             return '<' . $buffer . '>';
         }

diff --git a/src/ConverterExtra.php b/src/ConverterExtra.php
@@ -26,6 +26,13 @@ class ConverterExtra extends Converter
      */
     protected $row = 0;
 
+    /**
+     * Add CSS class after the tag
+     *
+     * @var bool
+     */
+    protected $addCssClass = true;
+
     /**
      * constructor, see Markdownify::Markdownify() for more information
      */
@@ -118,7 +125,7 @@ protected function handleHeader($level)
             $this->stack();
         } else {
             $tag = $this->unstack();
-            if (!empty($tag['cssSelector'])) {
+            if (!empty($tag['cssSelector']) && $this->addCssClass) {
                 // {#id.class}
                 $this->out(' {' . $tag['cssSelector'] . '}');
             }
@@ -148,7 +155,7 @@ protected function handleTag_a_parser()
     protected function handleTag_a_converter($tag, $buffer)
     {
         $output = parent::handleTag_a_converter($tag, $buffer);
-        if (!empty($tag['cssSelector'])) {
+        if (!empty($tag['cssSelector']) && $this->addCssClass) {
             // [This link][id]{#id.class}
             $output .= '{' . $tag['cssSelector'] . '}';
         }
@@ -295,13 +302,15 @@ protected function handleTag_table()
             $rows = [];
             // add padding
             array_walk_recursive($this->table['rows'], [&$this, 'alignTdContent']);
-            $header = array_shift($this->table['rows']);
-            array_push($rows, '| ' . implode(' | ', $header) . ' |');
-            array_push($rows, $separator);
-            foreach ($this->table['rows'] as $row) {
-                array_push($rows, '| ' . implode(' | ', $row) . ' |');
+            if (!empty( $this->table['rows'])) {
+                $header = array_shift($this->table['rows']);
+                array_push($rows, '| ' . implode(' | ', $header) . ' |');
+                array_push($rows, $separator);
+                foreach ($this->table['rows'] as $row) {
+                    array_push($rows, '| ' . implode(' | ', $row) . ' |');
+                }
+                $this->out(implode("\n" . $this->indent, $rows));
             }
-            $this->out(implode("\n" . $this->indent, $rows));
             $this->table = [];
             $this->setLineBreaks(2);
         }
@@ -568,4 +577,15 @@ protected function getCurrentCssSelector()
         }
         return $cssSelector;
     }
+
+    /**
+     * set add CSS class after the tag
+     *
+     * @param bool $addCssClass
+     * @return void
+     */
+    public function setAddCssClass($addCssClass)
+    {
+        $this->addCssClass = $addCssClass;
+    }
 }
diff --git a/src/Parser.php b/src/Parser.php
@@ -7,6 +7,8 @@ class Parser
     public static $skipWhitespace = true;
     public static $a_ord;
     public static $z_ord;
+    public static $n0_ord;
+    public static $n9_ord;
     public static $special_ords;
 
     /**
@@ -172,6 +174,7 @@ class Parser
         'noframes' => true,
         'noscript' => true,
         'ol' => true,
+        'o:p' => true,
         'p' => true,
         'pre' => true,
         'table' => true,
@@ -354,6 +357,8 @@ protected function parseTag()
         if (!isset(static::$a_ord)) {
             static::$a_ord = ord('a');
             static::$z_ord = ord('z');
+            static::$n0_ord = ord('0');
+            static::$n9_ord = ord('9');
             static::$special_ords = [
                 ord(':'), // for xml:lang
                 ord('-'), // for http-equiv
@@ -370,7 +375,7 @@ protected function parseTag()
         // get tagName
         while (isset($this->html[$pos])) {
             $pos_ord = ord(strtolower($this->html[$pos]));
-            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
+            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || (!empty($tagName) && is_numeric($this->html[$pos])) || in_array($pos_ord, static::$special_ords)) {
                 $tagName .= $this->html[$pos];
                 $pos++;
             } else {
@@ -410,13 +415,13 @@ protected function parseTag()
             }
 
             $pos_ord = ord(strtolower($this->html[$pos]));
-            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || in_array($pos_ord, static::$special_ords)) {
+            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || in_array($pos_ord, static::$special_ords) || (substr($currAttrib, 0, 5) === 'xmlns' && $pos_ord >= static::$n0_ord && $pos_ord <= static::$n9_ord)) {
                 // attribute name
                 $currAttrib .= $this->html[$pos];
             } elseif (in_array($this->html[$pos], [' ', "\t", "\n"])) {
                 // drop whitespace
             } elseif (in_array($this->html[$pos] . $this->html[$pos + 1], ['="', "='"])) {
-                // get attribute value
+                // get quoted attribute value
                 $pos++;
                 $await = $this->html[$pos]; // single or double quote
                 $pos++;
@@ -427,6 +432,17 @@ protected function parseTag()
                 }
                 $attributes[$currAttrib] = $value;
                 $currAttrib = '';
+            } elseif ($this->html[$pos] === '=') {
+                // get unquoted attribute value
+                $pos++;
+                $value = '';
+                while (isset($this->html[$pos]) && !in_array($this->html[$pos], array(' ', '/', '>'), true)) {
+                    $value .= $this->html[$pos];
+                    $pos++;
+                }
+                $pos--;
+                $attributes[$currAttrib] = $value;
+                $currAttrib = '';
             } else {
                 $this->invalidTag();
 

diff --git a/test/ConverterTestCase.php b/test/ConverterTestCase.php
@@ -166,7 +166,7 @@ public function providerBlockquoteConversion()
         $data['simple']['md'] = '> blockquoted text goes here';
         $data['paragraphs']['html'] = '<blockquote><p>paragraph1</p><p>paragraph2</p></blockquote>';
         $data['paragraphs']['md'] = '> paragraph1' . PHP_EOL
-            . '> ' . PHP_EOL
+            . '>' . PHP_EOL
             . '> paragraph2';
         $data['cascade']['html'] = '<blockquote><blockquote>cascading blockquote</blockquote></blockquote>';
         $data['cascade']['md'] = '> > cascading blockquote';
@@ -209,7 +209,7 @@ public function providerListConversion()
             . '  2. Magic';
         $data['next-to-text-in-block-context']['html'] = '<blockquote>McHale<ol><li>Bird</li><li>Magic</li></ol></blockquote>';
         $data['next-to-text-in-block-context']['md'] = '> McHale' . PHP_EOL
-            . '> ' . PHP_EOL
+            . '>' . PHP_EOL
             . '>   1. Bird' . PHP_EOL
             . '>   2. Magic';
         $data['next-to-bold']['html'] = '<b>McHale</b><ol><li>Bird</li><li>Magic</li></ol>';
@@ -218,7 +218,7 @@ public function providerListConversion()
             . '  1. Bird' . PHP_EOL
             . '  2. Magic';
         $data['next-to-bold-and-br']['html'] = '<b>McHale</b><br><ol><li>Bird</li><li>Magic</li></ol>';
-        $data['next-to-bold-and-br']['md'] = '**McHale**  ' . PHP_EOL
+        $data['next-to-bold-and-br']['md'] = '**McHale**' . PHP_EOL
             . PHP_EOL
             . PHP_EOL
             . '  1. Bird' . PHP_EOL
@@ -482,7 +482,7 @@ public function providerRulesConversion()
         $data['escape-']['html'] = '-----------------------------------';
         $data['escape-']['md'] = '\---\---\---\---\---\---\---\---\---\---\-----';
         $data['escape-']['html'] = '*****************';
-        $data['escape-']['md'] = '\***\***\***\***\*****';
+        $data['escape-']['md'] = '\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*';
 
         return $data;
     }
@@ -504,9 +504,9 @@ public function providerFixBreaks()
     {
         $data = [];
         $data['break1']['html'] = "<strong>Hello,<br>How are you doing?</strong>";
-        $data['break1']['md'] = "**Hello,  \nHow are you doing?**";
+        $data['break1']['md'] = "**Hello,\nHow are you doing?**";
         $data['break2']['html'] = "<b>Hey,<br> How you're doing?</b><br><br><b>Sorry<br><br> You can't get through</b>";
-        $data['break2']['md'] = "**Hey,  \nHow you're doing?**  \n  \n**Sorry  \n  \nYou can't get through**";
+        $data['break2']['md'] = "**Hey,\nHow you're doing?**\n\n**Sorry\n\nYou can't get through**";
 
         return $data;
     }