From a74d05c336ad458d01cce6bb247fbb3c2e5c5a13 Mon Sep 17 00:00:00 2001
From: fieg
Date: Thu, 19 Mar 2015 17:01:42 +0100
Subject: [PATCH] fix breaks and spaces

---
 src/Markdownify/Converter.php               | 181 ++++++++++++++++++++
 test/Test/Markdownify/ConverterTestCase.php |  51 ++++++-
 2 files changed, 231 insertions(+), 1 deletion(-)

diff --git a/src/Markdownify/Converter.php b/src/Markdownify/Converter.php
index 5caf99a..c028382 100644
--- a/src/Markdownify/Converter.php
+++ b/src/Markdownify/Converter.php
@@ -274,6 +274,8 @@ public function __construct($linkPosition = self::LINK_AFTER_CONTENT, $bodyWidth
      */
     public function parseString($html)
     {
+        $html = $this->prepareHtml($html);
+
         $this->parser->html = $html;
         $this->parse();
 
@@ -1327,4 +1329,183 @@ protected function parent()
     {
         return end($this->parser->openTags);
     }
+
+    /**
+     * Helper method to prepare html for correct markdown parsing. This will correct BR tags inside other
+     * tags like EM or STRONG.
+     *
+     * For example:
+     *   <strong>Hello,<br>How are you doing?</strong>
+     * Will be corrected to
+     *   <strong>Hello,</strong><br><strong>How are you doing?</strong>
+     *
+     * Every piece is a shallow copy of the original element, so attributes (the href of an A tag for
+     * example) survive the split.
+     *
+     * @param \DOMDocument $dom
+     */
+    protected function fixBreaks(\DOMDocument $dom)
+    {
+        $stopTags = array('body', 'p');
+
+        // getElementsByTagName() returns a live list that cannot be iterated safely while the tree is
+        // rewritten, so fix one element per scan and rescan until nothing is left to fix.
+        do {
+            $parent = null;
+
+            foreach ($dom->getElementsByTagName('br') as $br) {
+                $candidate = $br->parentNode;
+
+                if ($candidate instanceof \DOMElement
+                    && $candidate->parentNode instanceof \DOMElement
+                    && !in_array($candidate->tagName, $stopTags, true)
+                ) {
+                    $parent = $candidate;
+
+                    break;
+                }
+            }
+
+            if ($parent !== null) {
+                $parent->parentNode->replaceChild($this->splitAtBreaks($dom, $parent), $parent);
+            }
+        } while ($parent !== null);
+    }
+
+    /**
+     * Splits the children of an element at every BR. Each run of non-BR children is wrapped in a shallow
+     * copy of the element (same tag name, same attributes), the BR tags end up between those copies.
+     *
+     * @param \DOMDocument $dom
+     * @param \DOMElement  $parent
+     *
+     * @return \DOMDocumentFragment
+     */
+    protected function splitAtBreaks(\DOMDocument $dom, \DOMElement $parent)
+    {
+        $result = $dom->createDocumentFragment();
+        $wrapper = null;
+
+        foreach ($parent->childNodes as $child) {
+            if ($child->nodeName === 'br') {
+                $result->appendChild($child->cloneNode(true));
+                $wrapper = null;
+
+                continue;
+            }
+
+            if ($wrapper === null) {
+                $wrapper = $parent->cloneNode(false);
+                $result->appendChild($wrapper);
+            }
+
+            $wrapper->appendChild($child->cloneNode(true));
+        }
+
+        return $result;
+    }
+
+    /**
+     * Helper method to prepare html for correct markdown parsing. This will correct spaces around tags.
+     * It will correct spaces at begin tag and end tag.
+     *
+     * For example:
+     *   <p>This is<strong> strong </strong>text</p>
+     * Will be corrected to
+     *   <p>This is <strong>strong</strong> text</p>
+     *
+     * @param \DOMDocument $dom
+     * @param string       $tagName
+     */
+    protected function fixTagSpaces(\DOMDocument $dom, $tagName)
+    {
+        /** @var \DOMElement $element */
+        foreach ($dom->getElementsByTagName($tagName) as $element) {
+            $first = $element->firstChild;
+
+            // substr() rather than [0], so an empty text node cannot trigger a string offset error
+            if ($first instanceof \DOMText && substr($first->data, 0, 1) === ' ') {
+                // trim the node's own data; wholeText would also include adjacent text nodes
+                $first->data = ltrim($first->data, ' ');
+                $element->parentNode->insertBefore($dom->createTextNode(' '), $element);
+            }
+
+            $last = $element->lastChild;
+
+            if ($last instanceof \DOMText && substr($last->data, -1) === ' ') {
+                $last->data = rtrim($last->data, ' ');
+
+                if ($element->nextSibling) {
+                    $element->parentNode->insertBefore($dom->createTextNode(' '), $element->nextSibling);
+                } else {
+                    $element->parentNode->appendChild($dom->createTextNode(' '));
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns inner html from a dom document node. The children are serialized one by one with saveXML().
+     *
+     * @param \DOMDocument $dom
+     * @param \DOMNode     $node
+     *
+     * @return string
+     */
+    protected function getInnerHtml(\DOMDocument $dom, \DOMNode $node)
+    {
+        $innerHtml = '';
+
+        foreach ($node->childNodes as $child) {
+            $innerHtml .= $dom->saveXML($child);
+        }
+
+        return $innerHtml;
+    }
+
+    /**
+     * Applies some fixes so we can better parse the html: BR tags are moved out of the tags that contain
+     * them (see fixBreaks) and leading/trailing spaces are moved out of EM, STRONG, B and I tags.
+     *
+     * @param string $html
+     *
+     * @return string
+     */
+    protected function prepareHtml($html)
+    {
+        $dom = new \DOMDocument();
+        $dom->substituteEntities = false;
+
+        // real-world html is rarely valid: collect the libxml errors instead of raising a warning for each
+        $previousSetting = libxml_use_internal_errors(true);
+
+        // extra mb_convert_encoding pass http://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly?answertab=active#tab-top
+        // in some environments the meta charset is not enough
+        $dom->loadHTML(
+            mb_convert_encoding(
+                '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head><body>'
+                . $html . '</body></html>',
+                'HTML-ENTITIES',
+                'UTF-8'
+            )
+        );
+
+        libxml_clear_errors();
+        libxml_use_internal_errors($previousSetting);
+
+        $this->fixBreaks($dom);
+
+        foreach (array('em', 'strong', 'b', 'i') as $tagName) {
+            $this->fixTagSpaces($dom, $tagName);
+        }
+
+        $body = $dom->getElementsByTagName('body')->item(0);
+
+        if ($body === null) {
+            // no body to read the result from, hand the input back untouched
+            return $html;
+        }
+
+        return $this->getInnerHtml($dom, $body);
+    }
 }
diff --git a/test/Test/Markdownify/ConverterTestCase.php b/test/Test/Markdownify/ConverterTestCase.php
index d4d32b6..6794785 100644
--- a/test/Test/Markdownify/ConverterTestCase.php
+++ b/test/Test/Markdownify/ConverterTestCase.php
@@ -84,7 +84,7 @@ public function providerAutoescapeConversion()
         return array(
             array('AT&amp;T', 'AT&amp;T'),
             array('4 &lt; 5', '4 &lt; 5'),
-            array('&copy;', '&copy;')
+            array('&copy;', '©')
         );
     }
 
@@ -432,4 +432,53 @@ public function providerRulesConversion()
 
         return $data;
     }
+
+
+    /* FIX BREAKS TESTS
+     *************************************************************************/
+
+    /**
+     * @dataProvider providerFixBreaks
+     */
+    public function testFixBreaks($html, $md)
+    {
+        $this->assertEquals($md, $this->converter->parseString($html));
+    }
+
+
+    public function providerFixBreaks()
+    {
+        $data = array();
+        $data['break1']['html'] = "<strong>Hello,<br>How are you doing?<br></strong>";
+        $data['break1']['md'] = "**Hello,**  \n**How are you doing?**";
+
+        return $data;
+    }
+
+    /* FIX TAG SPACES TESTS
+     *************************************************************************/
+
+    /**
+     * @dataProvider providerFixTagSpaces
+     */
+    public function testFixTagSpaces($html, $md)
+    {
+        $this->assertEquals($md, $this->converter->parseString($html));
+    }
+
+
+    public function providerFixTagSpaces()
+    {
+        $data = array();
+        $data['strong']['html'] = "<p>This is<strong> strong </strong>text</p>";
+        $data['strong']['md'] = "This is **strong** text";
+        $data['em']['html'] = "<p>This is<em> italic </em>text</p>";
+        $data['em']['md'] = "This is _italic_ text";
+        $data['b']['html'] = "<p>Not bold,<b> bold</b></p>";
+        $data['b']['md'] = "Not bold, **bold**";
+        $data['i']['html'] = "<p>Not italic,<i> italic</i></p>";
+        $data['i']['md'] = "Not italic, _italic_";
+
+        return $data;
+    }
 }