Skip to content

Commit

Permalink
Merge pull request #45 from marwoodandrew/fix-span-space
Browse files Browse the repository at this point in the history
[SD-4917] remove space injected with span tags, remove ascii control …
  • Loading branch information
akintolga authored Jul 8, 2016
2 parents 51102c0 + 28ff383 commit 60e1cdc
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,13 @@ def remove_tags(self, tag, tag_name):

def format_text_content(self, tag):
for child_tag in tag.find_all():
child_tag.replace_with(' {}'.format(child_tag.get_text().replace('\n', ' ')))
if child_tag.name == 'br':
child_tag.replace_with(' {}'.format(child_tag.get_text()))
else:
child_tag.replace_with('{}'.format(child_tag.get_text().replace('\n', ' ')))

para_text = tag.get_text().strip().replace('\n', ' ').replace('\xa0', ' ')
para_text = re.sub('[\x00-\x1f]', '', para_text)
if para_text != '':
tag.replace_with('{}\r\n\r\n'.format(para_text))
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,12 @@ def test_body_footer(self):

def test_strip_html_mixed_tags(self):
html = '<div>This is mixed&nbsp;<span style=\\\"background-color: transparent;\\\">content' \
' <p>this is para</p></div>' \
'<p>This is&nbsp;&nbsp;&nbsp;mixed content<div>this is para</div></p>'
' <p>this is para1</p></div>' \
'<p>This is&nbsp;&nbsp;&nbsp;mixed content<div> this is para2</div></p>'
formatted_content = self._formatter.get_text_content(html)

body_text = ('This is mixed content this is para\r\n\r\n'
'This is mixed content this is para\r\n\r\n')
body_text = ('This is mixed content this is para1\r\n\r\n'
'This is mixed content this is para2\r\n\r\n')

self.assertEqual(formatted_content, body_text)

Expand Down
7 changes: 4 additions & 3 deletions server/aap/publish/formatters/aap_ipnews_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,11 @@ def format_text_content(self, tag):
for child_tag in tag.find_all():
if child_tag.name == 'br':
child_tag.replace_with('\r\n{}'.format(child_tag.get_text()))
else:
child_tag.replace_with(' {}'.format(child_tag.get_text()))

para_text = re.sub(' +', ' ', tag.get_text().strip().replace('\n\n', ' ').replace('\xA0', ' '))
# remove runs of spaces and stray line feeds
para_text = re.sub(r' +', ' ', re.sub(r'(?<!\r)\n+', ' ', tag.get_text()).strip().replace('\xA0', ' '))
# remove control chars except \r and \n
para_text = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', para_text)
if len(para_text) > 80:
para_text = textwrap.fill(para_text, 80).replace('\n', ' \r\n')
if para_text != '':
Expand Down
126 changes: 126 additions & 0 deletions server/aap/publish/formatters/aap_ipnews_formatter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,132 @@ def testLFContent(self):
self.maxDiff = None
self.assertEqual(item['article_text'].split('\x19\r\n')[11], expected)

def testStraySpaceContent(self):
    """Format an article built from adjacent ``<span>`` tags and compare ``article_text``.

    The body holds two paragraphs whose text is split across ``<span>``
    elements; the formatted ``article_text`` is compared with a fixed
    expected string.
    """
    body_html = (
        '<p><span style=\"background-color: transparent;\">\"</span>'
        '<span style=\"background-color: transparent;\">However</span></p>'
        '<p>\"<span style=\"background-color: transparent;\">The proposed</p>'
    )
    article = {
        '_id': '3',
        'source': 'AAP',
        'anpa_category': [{'qcode': 'a'}],
        'headline': 'This is a test headline',
        'byline': 'joe',
        'slugline': 'slugline',
        'subject': [{'qcode': '02011001'}],
        'anpa_take_key': 'take_key',
        'unique_id': '1',
        'type': 'text',
        'body_html': body_html,
        'word_count': '1',
        'priority': 1,
        "linked_in_packages": [
            {
                "package": "package",
                "package_type": "takes"
            }
        ],
    }
    subscribers = self.app.data.find('subscribers', None, None)
    subscriber = subscribers[0]

    formatter = AAPIpNewsFormatter()
    formatted = formatter.format(article, subscriber)
    # Only the first (sequence, payload) pair is inspected; the sequence is unused.
    _, payload = formatted[0]
    doc = json.loads(payload)

    # Show the full diff if the comparison fails.
    self.maxDiff = None
    # NOTE(review): runs of spaces in this literal may have been collapsed
    # when the page was extracted -- confirm against the repository.
    expected = ' "However\r\n "The proposed\r\n\r\nAAP'
    self.assertEqual(doc['article_text'], expected)

def testNoneAsciNamesContent(self):
    """Format an article whose body contains accented letters and compare ``article_text``.

    The expected string spells the same names with unaccented letters.
    """
    body_html = '<p>Tommi Mäkinen crashes a Škoda in Äppelbo</p>'
    article = {
        '_id': '3',
        'source': 'AAP',
        'anpa_category': [{'qcode': 'a'}],
        'headline': 'This is a test headline',
        'byline': 'joe',
        'slugline': 'slugline',
        'subject': [{'qcode': '02011001'}],
        'anpa_take_key': 'take_key',
        'unique_id': '1',
        'type': 'text',
        'body_html': body_html,
        'word_count': '1',
        'priority': 1,
        "linked_in_packages": [
            {
                "package": "package",
                "package_type": "takes"
            }
        ],
    }
    subscribers = self.app.data.find('subscribers', None, None)
    subscriber = subscribers[0]

    formatter = AAPIpNewsFormatter()
    formatted = formatter.format(article, subscriber)
    # Only the first (sequence, payload) pair is inspected; the sequence is unused.
    _, payload = formatted[0]
    doc = json.loads(payload)

    # Show the full diff if the comparison fails.
    self.maxDiff = None
    # NOTE(review): leading whitespace in this literal may have been collapsed
    # when the page was extracted -- confirm against the repository.
    expected = ' Tommi Makinen crashes a Skoda in Appelbo\r\n\r\nAAP'
    self.assertEqual(doc['article_text'], expected)

def testSpacesContent(self):
    """Format a paragraph mixing plain spaces, ``&nbsp;`` entities and ``\\xA0``.

    The formatted ``article_text`` is compared with a fixed expected string.
    """
    # NOTE(review): runs of spaces in the literals below may have been
    # collapsed when the page was extracted -- confirm against the repository.
    body_html = '<p>a b c d&nbsp;e&nbsp;&nbsp;f\xA0g</p>'
    article = {
        '_id': '3',
        'source': 'AAP',
        'anpa_category': [{'qcode': 'a'}],
        'headline': 'This is a test headline',
        'byline': 'joe',
        'slugline': 'slugline',
        'subject': [{'qcode': '02011001'}],
        'anpa_take_key': 'take_key',
        'unique_id': '1',
        'type': 'text',
        'body_html': body_html,
        'word_count': '1',
        'priority': 1,
        "linked_in_packages": [
            {
                "package": "package",
                "package_type": "takes"
            }
        ],
    }
    subscribers = self.app.data.find('subscribers', None, None)
    subscriber = subscribers[0]

    formatter = AAPIpNewsFormatter()
    formatted = formatter.format(article, subscriber)
    # Only the first (sequence, payload) pair is inspected; the sequence is unused.
    _, payload = formatted[0]
    doc = json.loads(payload)

    expected = ' a b c d e f g\r\n\r\nAAP'
    self.assertEqual(doc['article_text'], expected)

def testControlCharsContent(self):
    """Format a paragraph made of control characters, spaces and ``&nbsp;``.

    The body contains only escape sequences for control characters plus
    whitespace; the formatted ``article_text`` is compared with a fixed
    expected string that contains none of those control characters.
    """
    body_html = (
        '<p><span style=\"background-color: transparent;\">\u0018\u0012\f \u000b\u0012\b</span>'
        '<span style=\"background-color: transparent;\">\u0005\f\u0006\b \u0006\f\u0019&nbsp;</span>'
        '</p>'
    )
    article = {
        '_id': '3',
        'source': 'AAP',
        'anpa_category': [{'qcode': 'a'}],
        'headline': 'This is a test headline',
        'byline': 'joe',
        'slugline': 'slugline',
        'subject': [{'qcode': '02011001'}],
        'anpa_take_key': 'take_key',
        'unique_id': '1',
        'type': 'text',
        'body_html': body_html,
        'word_count': '1',
        'priority': 1,
        "linked_in_packages": [
            {
                "package": "package",
                "package_type": "takes"
            }
        ],
    }
    subscribers = self.app.data.find('subscribers', None, None)
    subscriber = subscribers[0]

    formatter = AAPIpNewsFormatter()
    formatted = formatter.format(article, subscriber)
    # Only the first (sequence, payload) pair is inspected; the sequence is unused.
    _, payload = formatted[0]
    doc = json.loads(payload)

    # NOTE(review): leading whitespace in this literal may have been collapsed
    # when the page was extracted -- confirm against the repository.
    expected = ' \r\n\r\nAAP'
    self.assertEqual(doc['article_text'], expected)

def testMultipleCategories(self):
article = {
'source': 'AAP',
Expand Down
5 changes: 2 additions & 3 deletions server/aap/publish/formatters/aap_newscentre_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,9 @@ def format_text_content(self, tag):
for child_tag in tag.find_all():
if child_tag.name == 'br':
child_tag.replace_with('\r\n{}'.format(child_tag.get_text()))
else:
child_tag.replace_with(' {}'.format(child_tag.get_text()))

para_text = re.sub(' +', ' ', tag.get_text().strip().replace('\n\n', ' ').replace('\xA0', ' '))
para_text = re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', tag.get_text()).strip().replace('\xA0', ' '))
para_text = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', para_text)
if para_text != '':
tag.replace_with(' {}\r\n\r\n'.format(para_text))
else:
Expand Down
5 changes: 2 additions & 3 deletions server/aap/publish/formatters/anpa_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,9 @@ def format_text_content(self, tag):
for child_tag in tag.find_all():
if child_tag.name == 'br':
child_tag.replace_with('\r\n{}'.format(child_tag.get_text()))
else:
child_tag.replace_with(' {}'.format(child_tag.get_text()))

para_text = re.sub(' +', ' ', tag.get_text().strip().replace('\n\n', ' ').replace('\xA0', ' '))
para_text = re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', tag.get_text()).strip().replace('\xA0', ' '))
para_text = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', para_text)
if para_text != '':
tag.replace_with(' {}\r\n'.format(para_text))
else:
Expand Down
2 changes: 1 addition & 1 deletion server/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ honcho==0.6.6
pyodbc==3.0.10
unidecode==0.04.19

git+git://github.com/superdesk/superdesk-core.git@02ed9f395e10658d1c87393f743a0b45dd94c9bb#egg=Superdesk-Core
git+git://github.com/superdesk/superdesk-core.git@70b76c975a35c5bc7644f474a5cf9fda54753e2e#egg=Superdesk-Core

0 comments on commit 60e1cdc

Please sign in to comment.