Merge pull request #45 from marwoodandrew/fix-span-space

[SD-4917] remove space injected with span tags, remove ASCII control characters
superdesk · Jul 8, 2016 · 60e1cdc · 60e1cdc
2 parents 51102c0 + 28ff383
commit 60e1cdc
Show file tree

Hide file tree

Showing 7 changed files with 144 additions and 15 deletions.
diff --git a/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py b/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py
@@ -87,9 +87,13 @@ def remove_tags(self, tag, tag_name):
 
     def format_text_content(self, tag):
         for child_tag in tag.find_all():
-            child_tag.replace_with(' {}'.format(child_tag.get_text().replace('\n', ' ')))
+            if child_tag.name == 'br':
+                child_tag.replace_with(' {}'.format(child_tag.get_text()))
+            else:
+                child_tag.replace_with('{}'.format(child_tag.get_text().replace('\n', ' ')))
 
         para_text = tag.get_text().strip().replace('\n', ' ').replace('\xa0', ' ')
+        para_text = re.sub('[\x00-\x1f]', '', para_text)
         if para_text != '':
             tag.replace_with('{}\r\n\r\n'.format(para_text))
         else:

diff --git a/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py b/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py
@@ -235,12 +235,12 @@ def test_body_footer(self):
 
     def test_strip_html_mixed_tags(self):
         html = '<div>This is mixed&nbsp;<span style=\\\"background-color: transparent;\\\">content' \
-               ' <p>this is para</p></div>' \
-               '<p>This is&nbsp;&nbsp;&nbsp;mixed content<div>this is para</div></p>'
+               ' <p>this is para1</p></div>' \
+               '<p>This is&nbsp;&nbsp;&nbsp;mixed content<div> this is para2</div></p>'
         formatted_content = self._formatter.get_text_content(html)
 
-        body_text = ('This is mixed content this is para\r\n\r\n'
-                     'This is mixed content this is para\r\n\r\n')
+        body_text = ('This is mixed content this is para1\r\n\r\n'
+                     'This is mixed content this is para2\r\n\r\n')
 
         self.assertEqual(formatted_content, body_text)
 

diff --git a/server/aap/publish/formatters/aap_ipnews_formatter.py b/server/aap/publish/formatters/aap_ipnews_formatter.py
@@ -91,10 +91,11 @@ def format_text_content(self, tag):
         for child_tag in tag.find_all():
             if child_tag.name == 'br':
                 child_tag.replace_with('\r\n{}'.format(child_tag.get_text()))
-            else:
-                child_tag.replace_with(' {}'.format(child_tag.get_text()))
 
-        para_text = re.sub(' +', ' ', tag.get_text().strip().replace('\n\n', ' ').replace('\xA0', ' '))
+        # remove runs of spaces and stray line feeds
+        para_text = re.sub(r' +', ' ', re.sub(r'(?<!\r)\n+', ' ', tag.get_text()).strip().replace('\xA0', ' '))
+        # remove control chars except \r and \n
+        para_text = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', para_text)
         if len(para_text) > 80:
             para_text = textwrap.fill(para_text, 80).replace('\n', ' \r\n')
         if para_text != '':

diff --git a/server/aap/publish/formatters/aap_ipnews_formatter_test.py b/server/aap/publish/formatters/aap_ipnews_formatter_test.py
@@ -266,6 +266,132 @@ def testLFContent(self):
         self.maxDiff = None
         self.assertEqual(item['article_text'].split('\x19\r\n')[11], expected)
 
+    def testStraySpaceContent(self):
+        article = {
+            '_id': '3',
+            'source': 'AAP',
+            'anpa_category': [{'qcode': 'a'}],
+            'headline': 'This is a test headline',
+            'byline': 'joe',
+            'slugline': 'slugline',
+            'subject': [{'qcode': '02011001'}],
+            'anpa_take_key': 'take_key',
+            'unique_id': '1',
+            'type': 'text',
+            'body_html': '<p><span style=\"background-color: transparent;\">\"</span>'
+                         '<span style=\"background-color: transparent;\">However</span></p>'
+                         '<p>\"<span style=\"background-color: transparent;\">The proposed</p>',
+            'word_count': '1',
+            'priority': 1,
+            "linked_in_packages": [
+                {
+                    "package": "package",
+                    "package_type": "takes"
+                }
+            ],
+        }
+        subscriber = self.app.data.find('subscribers', None, None)[0]
+
+        f = AAPIpNewsFormatter()
+        seq, item = f.format(article, subscriber)[0]
+        item = json.loads(item)
+        expected = '   "However\r\n   "The proposed\r\n\r\nAAP'
+        self.maxDiff = None
+        self.assertEqual(item['article_text'], expected)
+
+    def testNoneAsciNamesContent(self):
+        article = {
+            '_id': '3',
+            'source': 'AAP',
+            'anpa_category': [{'qcode': 'a'}],
+            'headline': 'This is a test headline',
+            'byline': 'joe',
+            'slugline': 'slugline',
+            'subject': [{'qcode': '02011001'}],
+            'anpa_take_key': 'take_key',
+            'unique_id': '1',
+            'type': 'text',
+            'body_html': '<p>Tommi Mäkinen crashes a Škoda in Äppelbo</p>',
+            'word_count': '1',
+            'priority': 1,
+            "linked_in_packages": [
+                {
+                    "package": "package",
+                    "package_type": "takes"
+                }
+            ],
+        }
+        subscriber = self.app.data.find('subscribers', None, None)[0]
+
+        f = AAPIpNewsFormatter()
+        seq, item = f.format(article, subscriber)[0]
+        item = json.loads(item)
+        expected = '   Tommi Makinen crashes a Skoda in Appelbo\r\n\r\nAAP'
+        self.maxDiff = None
+        self.assertEqual(item['article_text'], expected)
+
+    def testSpacesContent(self):
+        article = {
+            '_id': '3',
+            'source': 'AAP',
+            'anpa_category': [{'qcode': 'a'}],
+            'headline': 'This is a test headline',
+            'byline': 'joe',
+            'slugline': 'slugline',
+            'subject': [{'qcode': '02011001'}],
+            'anpa_take_key': 'take_key',
+            'unique_id': '1',
+            'type': 'text',
+            'body_html': '<p>a b  c   d&nbsp;e&nbsp;&nbsp;f\xA0g</p>',
+            'word_count': '1',
+            'priority': 1,
+            "linked_in_packages": [
+                {
+                    "package": "package",
+                    "package_type": "takes"
+                }
+            ],
+        }
+        subscriber = self.app.data.find('subscribers', None, None)[0]
+
+        f = AAPIpNewsFormatter()
+        seq, item = f.format(article, subscriber)[0]
+        item = json.loads(item)
+        expected = '   a b c d e f g\r\n\r\nAAP'
+        self.assertEqual(item['article_text'], expected)
+
+    def testControlCharsContent(self):
+        article = {
+            '_id': '3',
+            'source': 'AAP',
+            'anpa_category': [{'qcode': 'a'}],
+            'headline': 'This is a test headline',
+            'byline': 'joe',
+            'slugline': 'slugline',
+            'subject': [{'qcode': '02011001'}],
+            'anpa_take_key': 'take_key',
+            'unique_id': '1',
+            'type': 'text',
+            'body_html': '<p><span style=\"background-color: transparent;\">\u0018\u0012\f \u000b\u0012\b</span>'
+                         '<span style=\"background-color: transparent;\">\u0005\f\u0006\b \u0006\f\u0019&nbsp;</span>'
+                         '</p>',
+            'word_count': '1',
+            'priority': 1,
+            "linked_in_packages": [
+                {
+                    "package": "package",
+                    "package_type": "takes"
+                }
+            ],
+        }
+        subscriber = self.app.data.find('subscribers', None, None)[0]
+
+        f = AAPIpNewsFormatter()
+        seq, item = f.format(article, subscriber)[0]
+        item = json.loads(item)
+        expected = '     \r\n\r\nAAP'
+        self.assertEqual(item['article_text'], expected)
+
     def testMultipleCategories(self):
         article = {
             'source': 'AAP',

diff --git a/server/aap/publish/formatters/aap_newscentre_formatter.py b/server/aap/publish/formatters/aap_newscentre_formatter.py
@@ -78,10 +78,9 @@ def format_text_content(self, tag):
         for child_tag in tag.find_all():
             if child_tag.name == 'br':
                 child_tag.replace_with('\r\n{}'.format(child_tag.get_text()))
-            else:
-                child_tag.replace_with(' {}'.format(child_tag.get_text()))
 
-        para_text = re.sub(' +', ' ', tag.get_text().strip().replace('\n\n', ' ').replace('\xA0', ' '))
+        para_text = re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', tag.get_text()).strip().replace('\xA0', ' '))
+        para_text = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', para_text)
         if para_text != '':
             tag.replace_with('   {}\r\n\r\n'.format(para_text))
         else:

diff --git a/server/aap/publish/formatters/anpa_formatter.py b/server/aap/publish/formatters/anpa_formatter.py
@@ -159,10 +159,9 @@ def format_text_content(self, tag):
         for child_tag in tag.find_all():
             if child_tag.name == 'br':
                 child_tag.replace_with('\r\n{}'.format(child_tag.get_text()))
-            else:
-                child_tag.replace_with(' {}'.format(child_tag.get_text()))
 
-        para_text = re.sub(' +', ' ', tag.get_text().strip().replace('\n\n', ' ').replace('\xA0', ' '))
+        para_text = re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', tag.get_text()).strip().replace('\xA0', ' '))
+        para_text = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', para_text)
         if para_text != '':
             tag.replace_with('   {}\r\n'.format(para_text))
         else:

diff --git a/server/requirements.txt b/server/requirements.txt
@@ -3,4 +3,4 @@ honcho==0.6.6
 pyodbc==3.0.10
 unidecode==0.04.19
 
-git+git://github.com/superdesk/superdesk-core.git@02ed9f395e10658d1c87393f743a0b45dd94c9bb#egg=Superdesk-Core
+git+git://github.com/superdesk/superdesk-core.git@70b76c975a35c5bc7644f474a5cf9fda54753e2e#egg=Superdesk-Core