diff --git a/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py b/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py
index 9f88366c5..743b60e7f 100644
--- a/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py
+++ b/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py
@@ -70,7 +70,7 @@ def format(self, article, subscriber, codes=None):
formatted_article['abstract'] = self.get_text_content(
to_ascii(formatted_article.get('abstract', '') or '')).strip()
formatted_article['headline'] = self.get_text_content(
- to_ascii(formatted_article.get('headline', ''))).strip()
+ to_ascii(formatted_article.get('headline', '')), space_on_elements=False).strip()
formatted_article['byline'] = self.get_text_content(
to_ascii(formatted_article.get('byline', '') or '')).strip()
@@ -116,7 +116,7 @@ def format(self, article, subscriber, codes=None):
# Report whether this formatter handles the requested output format.
# Returns the result of comparing format_type with the literal
# 'AAP BULLETIN BUILDER'; the article argument is accepted but is not
# inspected by this check.
def can_format(self, format_type, article):
return format_type == 'AAP BULLETIN BUILDER'
- def get_text_content(self, content):
+ def get_text_content(self, content, space_on_elements=True):
content = content.replace('
', '
').replace('', '')
# remove control chars except \n
content = re.sub('[\x00-\x09\x0b-\x1f]', '', content)
@@ -125,7 +125,7 @@ def get_text_content(self, content):
if content == '':
return ''
- parsed = parse_html(content, content='html', space_on_elements=True)
+ parsed = parse_html(content, content='html', space_on_elements=space_on_elements)
# breaks are replaced with spaces
for br in parsed.xpath('//br'):
diff --git a/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py b/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py
index b3973de98..accc47a0e 100644
--- a/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py
+++ b/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py
@@ -786,3 +786,59 @@ def test_embedded_item(self):
self.assertGreater(int(seq), 0)
test_article = json.loads(item.get('data'))
self.assertEqual(test_article['body_html'], '
pre amble
post amble
') + + def test_clean_headline_html(self): + article = { + config.ID_FIELD: '123', + config.VERSION: 2, + 'source': 'AAP', + 'headline': '1234567890123456789012345123456789012345678901234567890', + 'slugline': 'slugline', + 'abstract': 'abstract
', + 'type': 'text', + 'anpa_category': [{'qcode': 'a', 'name': 'Australian General News'}], + 'flags': { + 'marked_for_legal': True + }, + 'body_html': ('The story
'), + "fields_meta": { + "headline": { + "draftjsState": [ + { + "blocks": [ + { + "key": "2fvvl", + "text": "1234567890123456789012345123456789012345678901234567890", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [ + { + "offset": 0, + "length": 55, + "style": "BOLD" + }, + { + "offset": 54, + "length": 1, + "style": "LIMIT_CHARACTERS_OVERFLOW" + } + ], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + } + ], + "entityMap": {} + } + ] + } + } + } + + subscriber = self.app.data.find('subscribers', None, None)[0][0] + seq, item = self._formatter.format(article, subscriber)[0] + item = json.loads(item) + self.assertGreater(int(seq), 0) + test_article = json.loads(item.get('data')) + self.assertEqual(test_article['headline'], '1234567890123456789012345123456789012345678901234567890')