From 66f45d9cb78ddecccbab35f386be4dacc5cf3844 Mon Sep 17 00:00:00 2001 From: marwoodandrew Date: Mon, 30 Sep 2024 11:37:07 +1000 Subject: [PATCH] SDAAP-122 Handle fancy quotes in ANPA --- .../aap/publish/formatters/anpa_formatter.py | 12 +++++++----- .../publish/formatters/anpa_formatter_test.py | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/server/aap/publish/formatters/anpa_formatter.py b/server/aap/publish/formatters/anpa_formatter.py index ba609a43a..956d910a8 100644 --- a/server/aap/publish/formatters/anpa_formatter.py +++ b/server/aap/publish/formatters/anpa_formatter.py @@ -17,7 +17,7 @@ from .field_mappers.locator_mapper import LocatorMapper from .field_mappers.slugline_mapper import SluglineMapper from eve.utils import config -from .unicodetoascii import to_ascii +from .unicodetoascii import to_ascii, clean_string from .category_list_map import get_aap_category_list import re from superdesk.etree import parse_html, to_string, etree @@ -104,7 +104,7 @@ def format(self, article, subscriber, codes=None): anpa.append(b'\x0D\x0A') if formatted_article.get('ednote', '') != '': - ednote = '{}\r\n'.format(to_ascii(formatted_article.get('ednote'))) + ednote = '{}\r\n'.format(to_ascii(clean_string(formatted_article.get('ednote')))) anpa.append(ednote.encode('ascii', 'replace')) if formatted_article.get(BYLINE): @@ -115,7 +115,7 @@ def format(self, article, subscriber, codes=None): anpa.append(get_text(self.append_body_footer(formatted_article), content='html').encode('ascii', 'replace')) else: - body = to_ascii(formatted_article.get('body_html', '')) + body = to_ascii(clean_string(formatted_article.get('body_html', ''))) # we need to inject the dateline if formatted_article.get('dateline', {}).get('text') and not article.get('auto_publish', False): body_html_elem = parse_html(formatted_article.get('body_html')) @@ -125,7 +125,8 @@ def format(self, article, subscriber, codes=None): body = to_string(body_html_elem) anpa.append(self.get_text_content(body)) if formatted_article.get('body_footer'): - anpa.append(self.get_text_content(to_ascii(formatted_article.get('body_footer', '')))) + anpa.append( + self.get_text_content(to_ascii(clean_string(formatted_article.get('body_footer', ''))))) anpa.append(b'\x0D\x0A') anpa.append(mapped_source.encode('ascii')) @@ -171,7 +172,8 @@ def get_text_content(self, content): def _process_headline(self, anpa, article, category): # prepend the locator to the headline if required article['headline'] = get_text(article.get('headline', '')) - headline = to_ascii(LocatorMapper().get_formatted_headline(article, category.decode('UTF-8').upper())) + headline = to_ascii( + clean_string(LocatorMapper().get_formatted_headline(article, category.decode('UTF-8').upper()))) # Set the maximum size to 64 including the sequence number if any if len(headline) > 64: diff --git a/server/aap/publish/formatters/anpa_formatter_test.py b/server/aap/publish/formatters/anpa_formatter_test.py index a9224fca5..e86ba856b 100644 --- a/server/aap/publish/formatters/anpa_formatter_test.py +++ b/server/aap/publish/formatters/anpa_formatter_test.py @@ -388,6 +388,24 @@ def test_preformated(self): self.assertTrue(lines.getvalue().split('\r')[3].lstrip(), 'Test line 1') self.assertTrue(lines.getvalue().split('\r')[4], 'Test line 2') + def test_fancy_quotes(self): + f = AAPAnpaFormatter() + subscriber = self.app.data.find('subscribers', None, None)[0][0] + item = self.article.copy() + item.update({ + 'body_html': "

\"quoted”

" + "

“In“ IBAC’s

" + "

Short hyphen­not handled fix one day!

" + "

“Then ‘You can’t have it’,\"

", + 'format': 'html'}) + resp = f.format(item, subscriber)[0] + out = resp['encoded_item'] + + lines = io.StringIO(out.decode()) + self.assertTrue(lines.getvalue().split('\r')[3].lstrip(), '"quoted"') + self.assertTrue(lines.getvalue().split('\r')[4].lstrip(), '"In" IBAC\'s') + self.assertTrue(lines.getvalue().split('\r')[6].lstrip(), '"Then \'You can\'t have it\'," ') + def test_embed_in_body_body(self): f = AAPAnpaFormatter() subscriber = self.app.data.find('subscribers', None, None)[0][0]