From c2602dd80ee6e22116c1613cc2022abfc5b6f616 Mon Sep 17 00:00:00 2001
From: marwoodandrew <amarwood@aap.com.au>
Date: Thu, 8 Sep 2016 14:21:47 +1000
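Subject: [PATCH] fix(formatters): top-level <br> tags cause badly formatted output

Normalise bare <br> tags to <br/> and strip stray </br> tags before the
body HTML is parsed in the IP News, Newscentre and ANPA formatters.
Also default a missing or null word_count to 0 in the IP News output,
and add tests covering both cases.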

---
 .../formatters/aap_ipnews_formatter.py        |  3 +-
 .../formatters/aap_ipnews_formatter_test.py   | 61 +++++++++++++++++++
 .../formatters/aap_newscentre_formatter.py    |  1 +
 .../aap/publish/formatters/anpa_formatter.py  |  1 +
 4 files changed, 65 insertions(+), 1 deletion(-)
diff --git a/server/aap/publish/formatters/aap_ipnews_formatter.py b/server/aap/publish/formatters/aap_ipnews_formatter.py
index 3d2a31438..d4a4cd035 100644
--- a/server/aap/publish/formatters/aap_ipnews_formatter.py
+++ b/server/aap/publish/formatters/aap_ipnews_formatter.py
@@ -79,7 +79,7 @@ def format_for_source(self, article, subscriber, source, codes=None):
                     odbc_item['article_text'] += ' ' + sign_off
 
                 odbc_item['service_level'] = 'a'  # @service_level
-                odbc_item['wordcount'] = article.get('word_count', None)  # @wordcount
+                odbc_item['wordcount'] = article.get('word_count') or 0   # @wordcount
                 odbc_item['priority'] = map_priority(article.get('priority'))  # @priority
 
                 docs.append((pub_seq_num, json.dumps(odbc_item)))
@@ -93,6 +93,7 @@ def get_wrapped_text_content(self, content):
         :param content:
         :return:
         """
+        content = content.replace('<br>', '<br/>').replace('</br>', '')
         soup = BeautifulSoup(content, 'html.parser')
 
         for top_level_tag in soup.find_all(recursive=False):
diff --git a/server/aap/publish/formatters/aap_ipnews_formatter_test.py b/server/aap/publish/formatters/aap_ipnews_formatter_test.py
index fc185843e..28ce62044 100644
--- a/server/aap/publish/formatters/aap_ipnews_formatter_test.py
+++ b/server/aap/publish/formatters/aap_ipnews_formatter_test.py
@@ -685,6 +685,67 @@ def test_aap_ipnews_formatter_with_body_formatted(self):
                               'selector_codes': 'Axx',
                               'genre': 'Current', 'keyword': 'slugline', 'author': 'joe'})
 
+    def testAdvisoryWithBreaksContent(self):
+        article = {
+            '_id': '3',
+            'source': 'AAP',
+            'anpa_category': [{'qcode': 'a'}],
+            'headline': 'This is a test headline',
+            'byline': 'joe',
+            'slugline': 'slugline',
+            'subject': [{'qcode': '02011001'}],
+            'anpa_take_key': 'take_key',
+            'unique_id': '1',
+            'type': 'text',
+            'body_html': '<p>Economy</p><p>The latest national accounts.<br></p><p>Farm<br></p><p>If you ask Treasurer'
+                         '</p><br><p>Turnbull Howard<br></p><p>Former prime minister John Howard believes </p>',
+            'word_count': '1',
+            'priority': 1,
+            "linked_in_packages": [
+                {
+                    "package": "package",
+                    "package_type": "takes"
+                }
+            ],
+        }
+        subscriber = self.app.data.find('subscribers', None, None)[0]
+
+        f = AAPIpNewsFormatter()
+        seq, item = f.format(article, subscriber)[0]
+        item = json.loads(item)
+        expected = '   Economy\r\n   The latest national accounts.\r\n   Farm\r\n   If you ask Treasurer\r\n   ' \
+            'Turnbull Howard\r\n   Former prime minister John Howard believes\r\n\r\nAAP'
+        self.assertEqual(item['article_text'], expected)
+
+    def testNullWordCount(self):
+        article = {
+            '_id': '3',
+            'source': 'AAP',
+            'anpa_category': [{'qcode': 'a'}],
+            'headline': 'This is a test headline',
+            'byline': 'joe',
+            'slugline': 'slugline',
+            'subject': [{'qcode': '02011001'}],
+            'anpa_take_key': 'take_key',
+            'unique_id': '1',
+            'type': 'text',
+            'body_html': '<p>Test</p>',
+            'word_count': None,
+            'priority': 1,
+            "linked_in_packages": [
+                {
+                    "package": "package",
+                    "package_type": "takes"
+                }
+            ],
+        }
+        subscriber = self.app.data.find('subscribers', None, None)[0]
+
+        f = AAPIpNewsFormatter()
+        seq, item = f.format(article, subscriber)[0]
+        item = json.loads(item)
+        self.assertEqual(item['wordcount'], 0)
+
 
 class DefaultSubjectTest(SuperdeskTestCase):
 
diff --git a/server/aap/publish/formatters/aap_newscentre_formatter.py b/server/aap/publish/formatters/aap_newscentre_formatter.py
index 8e3c60aa7..76969baf7 100644
--- a/server/aap/publish/formatters/aap_newscentre_formatter.py
+++ b/server/aap/publish/formatters/aap_newscentre_formatter.py
@@ -77,6 +77,7 @@ def _get_category_list(self, category_list):
         return get_aap_category_list(category_list)
 
     def get_text_content(self, content):
+        content = content.replace('<br>', '<br/>').replace('</br>', '')
         soup = BeautifulSoup(content, 'html.parser')
 
         for top_level_tag in soup.find_all(recursive=False):
diff --git a/server/aap/publish/formatters/anpa_formatter.py b/server/aap/publish/formatters/anpa_formatter.py
index 3c8fd2599..62167d507 100644
--- a/server/aap/publish/formatters/anpa_formatter.py
+++ b/server/aap/publish/formatters/anpa_formatter.py
@@ -154,6 +154,7 @@ def format(self, article, subscriber, codes=None):
             raise FormatterError.AnpaFormatterError(ex, subscriber)
 
     def get_text_content(self, content):
+        content = content.replace('<br>', '<br/>').replace('</br>', '')
         soup = BeautifulSoup(content, 'html.parser')
 
         for top_level_tag in soup.find_all(recursive=False):