diff --git a/src/Converter.php b/src/Converter.php index f157323..85ccc7f 100644 --- a/src/Converter.php +++ b/src/Converter.php @@ -133,6 +133,7 @@ class Converter protected $ignore = [ 'html', 'body', + 'o:p', ]; /** @@ -183,15 +184,15 @@ class Converter * TODO: what's with block chars / sequences at the beginning of a block? */ protected $escapeInText = [ - '\*\*([^*]+)\*\*' => '\*\*$1\*\*', // strong - '\*([^*]+)\*' => '\*$1\*', // em - '__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_', // strong - '_(?! |_)(.+)(?!<_| )_' => '\_$1\_', // em + '\*' => '\\\\*', // * + '_' => '\\\\_', // _ + '\|' => '\\\\|', // | '([-*_])([ ]{0,2}\1){2,}' => '\\\\$0', // hr - '`' => '\`', // code - '\[(.+)\](\s*\()' => '\[$1\]$2', // links: [text] (url) => [text\] (url) - '\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]', // links: [text][id] => [text\][id\] - '^#(#{0,5}) ' => '\#$1 ', // header + '`' => '\\\\`', // code + '\[(.+)\](\s*\()' => '\\\\[$1\\\\]$2', // links: [text] (url) => [text\] (url) + '\[(.+)\](\s*)\[(.*)\]' => '\\\\[$1\\\\]$2\\\\[$3\\\\]', // links: [text][id] => [text\][id\] + '^#(#{0,5}) ' => '\\\\#$1 ', // header # + '^=(=*\h*)$' => '\\\\=$1', // header = ]; /** @@ -227,6 +228,13 @@ class Converter */ protected $indent = ''; + /** + * previous indentation, when we want to disable current indentation and get it back later + * + * @var string + */ + static $previousIndent = ''; + /** * constructor, set options, setup parser * @@ -255,7 +263,7 @@ public function __construct($linkPosition = self::LINK_AFTER_CONTENT, $bodyWidth $search = []; $replace = []; foreach ($this->escapeInText as $s => $r) { - array_push($search, '@(?escapeInText = [ @@ -274,6 +282,7 @@ public function parseString($html) { $this->resetState(); + $html = str_replace(array("\r\n", "\r"), "\n", $html); $this->parser->html = $html; $this->parse(); @@ -302,6 +311,16 @@ public function setKeepHTML($keepHTML) $this->keepHTML = $keepHTML; } + /** + * return escapeInText + * + * @return array escapeInText + */ + public function getescapeInText() + { + return $this->escapeInText; + } + /** * iterate through the nodes and decide what we * shall do with the current node @@ -329,6 +348,7 @@ protected function parse() // else drop break; case 'text': + $this->flushLinebreaks(); $this->handleText(); break; case 'tag': @@ -395,7 +415,8 @@ protected function parse() } } // cleanup - $this->output = rtrim(str_replace('&', '&', str_replace('<', '<', str_replace('>', '>', $this->output)))); + $this->output = implode("\n", array_map('rtrim', explode("\n", $this->output))); + $this->output = rtrim(str_replace(['&', '<', '>', ' '], ['&', '<', '>', ' '], $this->output)); // end parsing, flush stacked tags $this->flushFootnotes(); $this->stack = []; @@ -507,7 +528,7 @@ protected function handleTagToText() { if (!$this->keepHTML) { if (!$this->parser->isStartTag && $this->parser->isBlockElement) { - $this->setLineBreaks(2); + $this->setLineBreaks(1); } } else { // don't convert to markdown inside this tag @@ -534,8 +555,7 @@ protected function handleTagToText() // don't indent inside
 tags
                     if ($this->parser->tagName == 'pre') {
                         $this->out($this->parser->node);
-                        static $indent;
-                        $indent = $this->indent;
+                        $this->previousIndent = $this->indent;
                         $this->indent = '';
                     } else {
                         $this->out($this->parser->node . "\n" . $this->indent);
@@ -556,8 +576,7 @@ protected function handleTagToText()
                     } else {
                         // reset indentation
                         $this->out($this->parser->node);
-                        static $indent;
-                        $this->indent = $indent;
+                        $this->indent = $this->previousIndent;
                     }
 
                     if (in_array($this->parent(), ['ins', 'del'])) {
@@ -579,7 +598,7 @@ protected function handleTagToText()
                     $this->buffer();
                 } else {
                     // add stuff so cleanup just reverses this
-                    $this->out(str_replace('<', '&lt;', str_replace('>', '&gt;', $this->unbuffer())));
+                    $this->out(str_replace(['<', '>'], ['&lt;', '&gt;'], $this->unbuffer()));
                 }
             }
         }
@@ -733,6 +752,7 @@ protected function handleTag_p()
     {
         if (!$this->parser->isStartTag) {
             $this->setLineBreaks(2);
+            $this->parser->html = ltrim($this->parser->html);
         }
     }
 
@@ -786,7 +806,7 @@ protected function handleTag_a_converter($tag, $buffer)
             return '[' . $buffer . ']()';
         }
 
-        if ($buffer == $tag['href'] && empty($tag['title'])) {
+        if (rtrim($buffer, '/') == rtrim($tag['href'], '/') && empty($tag['title'])) {
             // 
             return '<' . $buffer . '>';
         }
diff --git a/src/ConverterExtra.php b/src/ConverterExtra.php
index 838672a..3595f3e 100644
--- a/src/ConverterExtra.php
+++ b/src/ConverterExtra.php
@@ -26,6 +26,13 @@ class ConverterExtra extends Converter
      */
     protected $row = 0;
 
+    /**
+     * Add CSS class after the tag
+     *
+     * @var bool
+     */
+    protected $addCssClass = true;
+
     /**
      * constructor, see Markdownify::Markdownify() for more information
      */
@@ -118,7 +125,7 @@ protected function handleHeader($level)
             $this->stack();
         } else {
             $tag = $this->unstack();
-            if (!empty($tag['cssSelector'])) {
+            if (!empty($tag['cssSelector']) && $this->addCssClass) {
                 // {#id.class}
                 $this->out(' {' . $tag['cssSelector'] . '}');
             }
@@ -148,7 +155,7 @@ protected function handleTag_a_parser()
     protected function handleTag_a_converter($tag, $buffer)
     {
         $output = parent::handleTag_a_converter($tag, $buffer);
-        if (!empty($tag['cssSelector'])) {
+        if (!empty($tag['cssSelector']) && $this->addCssClass) {
             // [This link][id]{#id.class}
             $output .= '{' . $tag['cssSelector'] . '}';
         }
@@ -295,13 +302,15 @@ protected function handleTag_table()
             $rows = [];
             // add padding
             array_walk_recursive($this->table['rows'], [&$this, 'alignTdContent']);
-            $header = array_shift($this->table['rows']);
-            array_push($rows, '| ' . implode(' | ', $header) . ' |');
-            array_push($rows, $separator);
-            foreach ($this->table['rows'] as $row) {
-                array_push($rows, '| ' . implode(' | ', $row) . ' |');
+            if (!empty( $this->table['rows'])) {
+                $header = array_shift($this->table['rows']);
+                array_push($rows, '| ' . implode(' | ', $header) . ' |');
+                array_push($rows, $separator);
+                foreach ($this->table['rows'] as $row) {
+                    array_push($rows, '| ' . implode(' | ', $row) . ' |');
+                }
+                $this->out(implode("\n" . $this->indent, $rows));
             }
-            $this->out(implode("\n" . $this->indent, $rows));
             $this->table = [];
             $this->setLineBreaks(2);
         }
@@ -568,4 +577,15 @@ protected function getCurrentCssSelector()
         }
         return $cssSelector;
     }
+
+    /**
+     * set add CSS class after the tag
+     *
+     * @param bool $addCssClass
+     * @return void
+     */
+    public function setAddCssClass($addCssClass)
+    {
+        $this->addCssClass = $addCssClass;
+    }
 }
diff --git a/src/Parser.php b/src/Parser.php
index 2e3291d..6cb3b64 100644
--- a/src/Parser.php
+++ b/src/Parser.php
@@ -7,6 +7,8 @@ class Parser
     public static $skipWhitespace = true;
     public static $a_ord;
     public static $z_ord;
+    public static $n0_ord;
+    public static $n9_ord;
     public static $special_ords;
 
     /**
@@ -172,6 +174,7 @@ class Parser
         'noframes' => true,
         'noscript' => true,
         'ol' => true,
+        'o:p' => true,
         'p' => true,
         'pre' => true,
         'table' => true,
@@ -354,6 +357,8 @@ protected function parseTag()
         if (!isset(static::$a_ord)) {
             static::$a_ord = ord('a');
             static::$z_ord = ord('z');
+            static::$n0_ord = ord('0');
+            static::$n9_ord = ord('9');
             static::$special_ords = [
                 ord(':'), // for xml:lang
                 ord('-'), // for http-equiv
@@ -370,7 +375,7 @@ protected function parseTag()
         // get tagName
         while (isset($this->html[$pos])) {
             $pos_ord = ord(strtolower($this->html[$pos]));
-            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
+            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || (!empty($tagName) && is_numeric($this->html[$pos])) || in_array($pos_ord, static::$special_ords)) {
                 $tagName .= $this->html[$pos];
                 $pos++;
             } else {
@@ -410,13 +415,13 @@ protected function parseTag()
             }
 
             $pos_ord = ord(strtolower($this->html[$pos]));
-            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || in_array($pos_ord, static::$special_ords)) {
+            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || in_array($pos_ord, static::$special_ords) || (substr($currAttrib, 0, 5) === 'xmlns' && $pos_ord >= static::$n0_ord && $pos_ord <= static::$n9_ord)) {
                 // attribute name
                 $currAttrib .= $this->html[$pos];
             } elseif (in_array($this->html[$pos], [' ', "\t", "\n"])) {
                 // drop whitespace
             } elseif (in_array($this->html[$pos] . $this->html[$pos + 1], ['="', "='"])) {
-                // get attribute value
+                // get quoted attribute value
                 $pos++;
                 $await = $this->html[$pos]; // single or double quote
                 $pos++;
@@ -427,6 +432,17 @@ protected function parseTag()
                 }
                 $attributes[$currAttrib] = $value;
                 $currAttrib = '';
+            } elseif ($this->html[$pos] === '=') {
+                // get unquoted attribute value
+                $pos++;
+                $value = '';
+                while (isset($this->html[$pos]) && !in_array($this->html[$pos], array(' ', '/', '>'), true)) {
+                    $value .= $this->html[$pos];
+                    $pos++;
+                }
+                $pos--;
+                $attributes[$currAttrib] = $value;
+                $currAttrib = '';
             } else {
                 $this->invalidTag();
 
diff --git a/test/ConverterTestCase.php b/test/ConverterTestCase.php
index 1930f8d..ec318ee 100644
--- a/test/ConverterTestCase.php
+++ b/test/ConverterTestCase.php
@@ -166,7 +166,7 @@ public function providerBlockquoteConversion()
         $data['simple']['md'] = '> blockquoted text goes here';
         $data['paragraphs']['html'] = '

paragraph1

paragraph2

'; $data['paragraphs']['md'] = '> paragraph1' . PHP_EOL - . '> ' . PHP_EOL + . '>' . PHP_EOL . '> paragraph2'; $data['cascade']['html'] = '
cascading blockquote
'; $data['cascade']['md'] = '> > cascading blockquote'; @@ -209,7 +209,7 @@ public function providerListConversion() . ' 2. Magic'; $data['next-to-text-in-block-context']['html'] = '
McHale
  1. Bird
  2. Magic
'; $data['next-to-text-in-block-context']['md'] = '> McHale' . PHP_EOL - . '> ' . PHP_EOL + . '>' . PHP_EOL . '> 1. Bird' . PHP_EOL . '> 2. Magic'; $data['next-to-bold']['html'] = 'McHale
  1. Bird
  2. Magic
'; @@ -218,7 +218,7 @@ public function providerListConversion() . ' 1. Bird' . PHP_EOL . ' 2. Magic'; $data['next-to-bold-and-br']['html'] = 'McHale
  1. Bird
  2. Magic
'; - $data['next-to-bold-and-br']['md'] = '**McHale** ' . PHP_EOL + $data['next-to-bold-and-br']['md'] = '**McHale**' . PHP_EOL . PHP_EOL . PHP_EOL . ' 1. Bird' . PHP_EOL @@ -482,7 +482,7 @@ public function providerRulesConversion() $data['escape-']['html'] = '-----------------------------------'; $data['escape-']['md'] = '\---\---\---\---\---\---\---\---\---\---\-----'; $data['escape-']['html'] = '*****************'; - $data['escape-']['md'] = '\***\***\***\***\*****'; + $data['escape-']['md'] = '\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*'; return $data; } @@ -504,9 +504,9 @@ public function providerFixBreaks() { $data = []; $data['break1']['html'] = "Hello,
How are you doing?
"; - $data['break1']['md'] = "**Hello, \nHow are you doing?**"; + $data['break1']['md'] = "**Hello,\nHow are you doing?**"; $data['break2']['html'] = "Hey,
How you're doing?


Sorry

You can't get through
"; - $data['break2']['md'] = "**Hey, \nHow you're doing?** \n \n**Sorry \n \nYou can't get through**"; + $data['break2']['md'] = "**Hey,\nHow you're doing?**\n\n**Sorry\n\nYou can't get through**"; return $data; }