Skip to content

Commit

Permalink
Merge pull request #12 from fieg/html-fixes
Browse files Browse the repository at this point in the history
Fix HTML breaks & spaces before parsing.
  • Loading branch information
tzi committed Mar 26, 2015
2 parents ffbf923 + a74d05c commit 4689418
Show file tree
Hide file tree
Showing 2 changed files with 202 additions and 1 deletion.
152 changes: 152 additions & 0 deletions src/Markdownify/Converter.php
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,8 @@ public function __construct($linkPosition = self::LINK_AFTER_CONTENT, $bodyWidth
*/
public function parseString($html)
{
$html = $this->prepareHtml($html);

$this->parser->html = $html;
$this->parse();

Expand Down Expand Up @@ -1327,4 +1329,154 @@ protected function parent()
{
return end($this->parser->openTags);
}

/**
 * Helper method to prepare html for correct markdown parsing. This will correct BR tags inside other
 * tags like EM or STRONG.
 *
 * Every element that directly contains a BR (other than the stop tags BODY and P) is replaced by
 * one copy of itself per run of non-BR children, with the BR lifted between those copies. The step
 * is repeated until every BR is a direct child of a stop tag.
 *
 * For example:
 *   <strong>Hello,<br>How are you doing?</strong>
 * Will be corrected to
 *   <strong>Hello,</strong><br><strong>How are you doing?</strong>
 *
 * The copies are shallow clones of the original element, so its attributes (for instance the
 * href of a link) are kept on every piece:
 *   <a href="x">Hello,<br>World</a>
 * becomes
 *   <a href="x">Hello,</a><br><a href="x">World</a>
 *
 * @param \DOMDocument $dom document that is modified in place
 */
protected function fixBreaks(\DOMDocument $dom)
{
    $stopTags = array('body', 'p');

    do {
        $changed = false;

        // getElementsByTagName() returns a live list; once the tree has been restructured the
        // scan has to start over, hence the break below and the surrounding do/while.
        foreach ($dom->getElementsByTagName('br') as $br) {
            $parent = $br->parentNode;

            if (!$parent instanceof \DOMElement || in_array($parent->tagName, $stopTags, true)) {
                continue;
            }

            // The root element cannot be replaced by several siblings, leave it untouched.
            if (!$parent->parentNode instanceof \DOMElement) {
                continue;
            }

            $mainFragment = $dom->createDocumentFragment();
            $fragment = $dom->createDocumentFragment();

            foreach ($parent->childNodes as $child) {
                if ($child->nodeName !== 'br') {
                    $fragment->appendChild($child->cloneNode(true));
                    continue;
                }

                if ($fragment->hasChildNodes()) {
                    // Shallow clone: same tag and attributes as the parent, no children.
                    $wrapper = $parent->cloneNode(false);
                    $wrapper->appendChild($fragment);
                    $mainFragment->appendChild($wrapper);

                    // reset fragment
                    $fragment = $dom->createDocumentFragment();
                }

                $mainFragment->appendChild($child->cloneNode(true));
            }

            // Wrap whatever followed the last BR.
            if ($fragment->hasChildNodes()) {
                $wrapper = $parent->cloneNode(false);
                $wrapper->appendChild($fragment);
                $mainFragment->appendChild($wrapper);
            }

            $parent->parentNode->replaceChild($mainFragment, $parent);

            $changed = true;
            break;
        }
    } while ($changed);
}

/**
 * Helper method to prepare html for correct markdown parsing. This will correct spaces around tags.
 * A leading space inside the element is moved in front of the element, a trailing space inside
 * the element is moved behind it.
 *
 * For example:
 *   <p>This is<strong> strong</strong> text</p>
 * Will be corrected to
 *   <p>This is <strong>strong</strong> text</p>
 *
 * Only the plain space character is handled; any number of leading/trailing spaces is replaced
 * by a single space outside the element.
 *
 * @param \DOMDocument $dom     document that is modified in place
 * @param string       $tagName name of the elements to fix, e.g. "strong"
 */
protected function fixTagSpaces(\DOMDocument $dom, $tagName)
{
    /** @var \DOMElement $element */
    foreach ($dom->getElementsByTagName($tagName) as $element) {
        $container = $element->parentNode;
        if (!$container) {
            continue;
        }

        // Earlier passes may have left adjacent or empty text nodes behind (each pass inserts
        // stand-alone " " nodes). Merge them so the first/last child holds the complete text
        // and the element cannot keep a stray space in a neighbouring text node.
        $element->normalize();

        $first = $element->firstChild;
        // substr() instead of [0] so an empty text never triggers an offset error.
        if ($first instanceof \DOMText && substr($first->data, 0, 1) === ' ') {
            $element->replaceChild($dom->createTextNode(ltrim($first->data, ' ')), $first);
            $container->insertBefore($dom->createTextNode(' '), $element);
        }

        // Re-read lastChild: it may be the node that was just put in place above.
        $last = $element->lastChild;
        if ($last instanceof \DOMText && substr($last->data, -1) === ' ') {
            $element->replaceChild($dom->createTextNode(rtrim($last->data, ' ')), $last);

            if ($element->nextSibling) {
                $container->insertBefore($dom->createTextNode(' '), $element->nextSibling);
            } else {
                $container->appendChild($dom->createTextNode(' '));
            }
        }
    }
}

/**
 * Serializes the children of a node, i.e. the node's markup without its own start and end tag.
 *
 * Each child is serialized with DOMDocument::saveXML(), so empty elements are written in
 * XML style (for example <br/>).
 *
 * @param \DOMDocument $dom  document used to serialize the children
 * @param \DOMNode     $node node whose children are serialized
 *
 * @return string concatenated markup of all child nodes, empty string when there are none
 */
protected function getInnerHtml(\DOMDocument $dom, \DOMNode $node)
{
    $pieces = array();

    foreach ($node->childNodes as $childNode) {
        $pieces[] = $dom->saveXML($childNode);
    }

    return implode('', $pieces);
}

/**
 * Applies some fixes so we can better parse the html.
 *
 * The html is wrapped in a minimal UTF-8 document, loaded into a DOMDocument, run through
 * fixBreaks() and fixTagSpaces() (for EM, STRONG, B and I) and the content of the resulting
 * BODY element is returned.
 *
 * Markup problems reported by libxml while loading are collected internally instead of being
 * raised as PHP warnings; the previous libxml error mode is restored afterwards. When the
 * document cannot be loaded the html is returned unchanged.
 *
 * @param string $html
 *
 * @return string
 */
protected function prepareHtml($html)
{
    $dom = new \DOMDocument();
    $dom->substituteEntities = false;

    // Input is arbitrary html: keep libxml from emitting a PHP warning for every
    // markup problem it finds, and restore the caller's setting once loading is done.
    $previousErrorMode = libxml_use_internal_errors(true);

    // extra mb_convert_encoding pass http://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly?answertab=active#tab-top
    // in some environments the meta charset is not enough
    // NOTE(review): the 'HTML-ENTITIES' target encoding is deprecated as of PHP 8.2 - revisit when
    // raising the minimum PHP version.
    $loaded = $dom->loadHTML(
        mb_convert_encoding(
            '<html><head><meta charset="utf-8"></head><body>' . $html . '</body></html>',
            'HTML-ENTITIES',
            'UTF-8'
        )
    );

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $body = $loaded ? $dom->getElementsByTagName('body')->item(0) : null;
    if (!$body) {
        // Nothing usable was parsed; hand the input back untouched rather than failing.
        return $html;
    }

    $this->fixBreaks($dom);
    foreach (array('em', 'strong', 'b', 'i') as $tagName) {
        $this->fixTagSpaces($dom, $tagName);
    }

    // fixBreaks() never replaces BODY itself (it is a stop tag), so $body is still attached.
    return $this->getInnerHtml($dom, $body);
}


}
51 changes: 50 additions & 1 deletion test/Test/Markdownify/ConverterTestCase.php
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public function providerAutoescapeConversion()
return array(
array('AT&amp;T', 'AT&T'),
array('4 &lt; 5', '4 < 5'),
array('&copy;', '&copy;')
array('&copy;', '©')
);
}

Expand Down Expand Up @@ -432,4 +432,53 @@ public function providerRulesConversion()

return $data;
}


/* FIX BREAKS TESTS
*************************************************************************/

/**
 * Converts the html of a data set and compares the result with the expected markdown.
 *
 * @param string $html html handed to the converter
 * @param string $md   markdown the converter is expected to return
 *
 * @dataProvider providerFixBreaks
 */
public function testFixBreaks($html, $md)
{
    $actual = $this->converter->parseString($html);

    $this->assertEquals($md, $actual);
}


/**
 * Data sets for testFixBreaks(): html containing a BR inside an inline tag and the
 * markdown expected for it.
 *
 * @return array data sets keyed by name, each with an "html" and an "md" entry
 */
public function providerFixBreaks()
{
    return array(
        'break1' => array(
            'html' => "<strong>Hello,<br>How are you doing?</strong>",
            'md'   => "**Hello,** \n**How are you doing?**",
        ),
    );
}

/* FIX TAG SPACES TESTS
*************************************************************************/

/**
 * Converts the html of a data set and compares the result with the expected markdown.
 *
 * @param string $html html handed to the converter
 * @param string $md   markdown the converter is expected to return
 *
 * @dataProvider providerFixTagSpaces
 */
public function testFixTagSpaces($html, $md)
{
    $actual = $this->converter->parseString($html);

    $this->assertEquals($md, $actual);
}


/**
 * Data sets for testFixTagSpaces(): one case per supported tag (strong, em, b, i) with
 * spaces just inside the tag, and the markdown expected for it.
 *
 * @return array data sets keyed by tag name, each with an "html" and an "md" entry
 */
public function providerFixTagSpaces()
{
    return array(
        'strong' => array(
            'html' => "<p>This is<strong> strong</strong> text</p>",
            'md'   => "This is **strong** text",
        ),
        'em' => array(
            'html' => "<p>This is<em> italic </em> text</p>",
            'md'   => "This is _italic_ text",
        ),
        'b' => array(
            'html' => "<p>Not bold, <b> bold</b></p>",
            'md'   => "Not bold, **bold**",
        ),
        'i' => array(
            'html' => "<p>Not italic, <i>italic </i></p>",
            'md'   => "Not italic, _italic_",
        ),
    );
}
}

0 comments on commit 4689418

Please sign in to comment.