From c2602dd80ee6e22116c1613cc2022abfc5b6f616 Mon Sep 17 00:00:00 2001 From: marwoodandrew Date: Thu, 8 Sep 2016 14:21:47 +1000 Subject: [PATCH] fix(formatters) Top level breaks cause badly formatted output --- .../formatters/aap_ipnews_formatter.py | 3 +- .../formatters/aap_ipnews_formatter_test.py | 61 +++++++++++++++++++ .../formatters/aap_newscentre_formatter.py | 1 + .../aap/publish/formatters/anpa_formatter.py | 1 + 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/server/aap/publish/formatters/aap_ipnews_formatter.py b/server/aap/publish/formatters/aap_ipnews_formatter.py index 3d2a31438..d4a4cd035 100644 --- a/server/aap/publish/formatters/aap_ipnews_formatter.py +++ b/server/aap/publish/formatters/aap_ipnews_formatter.py @@ -79,7 +79,7 @@ def format_for_source(self, article, subscriber, source, codes=None): odbc_item['article_text'] += ' ' + sign_off odbc_item['service_level'] = 'a' # @service_level - odbc_item['wordcount'] = article.get('word_count', None) # @wordcount + odbc_item['wordcount'] = article.get('word_count') or 0 # @wordcount odbc_item['priority'] = map_priority(article.get('priority')) # @priority docs.append((pub_seq_num, json.dumps(odbc_item))) @@ -93,6 +93,7 @@ def get_wrapped_text_content(self, content): :param content: :return: """ + content = content.replace('<br>', '<br/>').replace('</br>', '') soup = BeautifulSoup(content, 'html.parser') for top_level_tag in soup.find_all(recursive=False): diff --git a/server/aap/publish/formatters/aap_ipnews_formatter_test.py b/server/aap/publish/formatters/aap_ipnews_formatter_test.py index fc185843e..28ce62044 100644 --- a/server/aap/publish/formatters/aap_ipnews_formatter_test.py +++ b/server/aap/publish/formatters/aap_ipnews_formatter_test.py @@ -685,6 +685,67 @@ def test_aap_ipnews_formatter_with_body_formatted(self): 'selector_codes': 'Axx', 'genre': 'Current', 'keyword': 'slugline', 'author': 'joe'}) + def testAdvisoryWithBreaksContent(self): + article = { + '_id': '3', + 'source': 'AAP', + 'anpa_category': [{'qcode': 'a'}], + 'headline': 'This is a test headline', + 'byline': 'joe', + 'slugline': 'slugline', + 'subject': [{'qcode': '02011001'}], + 'anpa_take_key': 'take_key', + 'unique_id': '1', + 'type': 'text', + 'body_html': '

Economy

The latest national accounts.

Farm

If you ask Treasurer' + '


Turnbull Howard

Former prime minister John Howard believes

', + 'word_count': '1', + 'priority': 1, + "linked_in_packages": [ + { + "package": "package", + "package_type": "takes" + } + ], + } + subscriber = self.app.data.find('subscribers', None, None)[0] + + f = AAPIpNewsFormatter() + seq, item = f.format(article, subscriber)[0] + item = json.loads(item) + expected = ' Economy\r\n The latest national accounts.\r\n Farm\r\n If you ask Treasurer\r\n ' \ + 'Turnbull Howard\r\n Former prime minister John Howard believes\r\n\r\nAAP' + self.assertEqual(item['article_text'], expected) + + def testNullWordCount(self): + article = { + '_id': '3', + 'source': 'AAP', + 'anpa_category': [{'qcode': 'a'}], + 'headline': 'This is a test headline', + 'byline': 'joe', + 'slugline': 'slugline', + 'subject': [{'qcode': '02011001'}], + 'anpa_take_key': 'take_key', + 'unique_id': '1', + 'type': 'text', + 'body_html': '

Test

', 'word_count': None, + 'priority': 1, + "linked_in_packages": [ + { + "package": "package", + "package_type": "takes" + } + ], + } + subscriber = self.app.data.find('subscribers', None, None)[0] + + f = AAPIpNewsFormatter() + seq, item = f.format(article, subscriber)[0] + item = json.loads(item) + self.assertEqual(item['wordcount'], 0) + class DefaultSubjectTest(SuperdeskTestCase): diff --git a/server/aap/publish/formatters/aap_newscentre_formatter.py b/server/aap/publish/formatters/aap_newscentre_formatter.py index 8e3c60aa7..76969baf7 100644 --- a/server/aap/publish/formatters/aap_newscentre_formatter.py +++ b/server/aap/publish/formatters/aap_newscentre_formatter.py @@ -77,6 +77,7 @@ def _get_category_list(self, category_list): return get_aap_category_list(category_list) def get_text_content(self, content): + content = content.replace('<br>', '<br/>').replace('</br>', '') soup = BeautifulSoup(content, 'html.parser') for top_level_tag in soup.find_all(recursive=False): diff --git a/server/aap/publish/formatters/anpa_formatter.py b/server/aap/publish/formatters/anpa_formatter.py index 3c8fd2599..62167d507 100644 --- a/server/aap/publish/formatters/anpa_formatter.py +++ b/server/aap/publish/formatters/anpa_formatter.py @@ -154,6 +154,7 @@ def format(self, article, subscriber, codes=None): raise FormatterError.AnpaFormatterError(ex, subscriber) def get_text_content(self, content): + content = content.replace('<br>', '<br/>').replace('</br>', '') soup = BeautifulSoup(content, 'html.parser') for top_level_tag in soup.find_all(recursive=False):