diff --git a/server/aap/publish/formatters/aap_apple_news_formatter.py b/server/aap/publish/formatters/aap_apple_news_formatter.py index 261337da9..d54c02ecd 100644 --- a/server/aap/publish/formatters/aap_apple_news_formatter.py +++ b/server/aap/publish/formatters/aap_apple_news_formatter.py @@ -10,33 +10,33 @@ import logging import json -import re from datetime import datetime +from pytz import timezone +from superdesk.utc import get_date from copy import deepcopy -from eve.utils import ParsedRequest, config +from eve.utils import config +import lxml.html as lxml_html +from draftjs_exporter.dom import DOM +from textwrap import dedent +from urllib.parse import urlparse, unquote from superdesk.publish.formatters import Formatter -from superdesk.metadata.item import FORMAT, FORMATS, ITEM_STATE, CONTENT_STATE +from superdesk.metadata.item import FORMAT, FORMATS from superdesk import get_resource_service from superdesk.utils import json_serialize_datetime_objectId from superdesk.utc import utc_to_local -from superdesk.etree import parse_html, to_string from superdesk.text_utils import get_text -from aap.text_utils import format_text_content -from aap.utils import is_fact_check +from superdesk.editor_utils import get_content_state_fields, Editor3Content, DraftJSHTMLExporter, render_fragment from aap.errors import AppleNewsError - logger = logging.getLogger(__name__) class AAPAppleNewsFormatter(Formatter): - name = 'AAP Apple News' type = 'AAP Apple News' APPLE_NEWS_VERSION = '1.8' - URL_REGEX = re.compile(r'(?:(?:https|http)://)[\w/\-?=%.]+\.[\w/\-?=%#@.\+:]+', re.IGNORECASE) def __init__(self): self.format_type = 'AAP Apple News' @@ -52,44 +52,123 @@ def format(self, article, subscriber, codes=None): except Exception as ex: raise AppleNewsError.AppleNewsFormatter(exception=ex) - def _format(self, article): - apple_news = {} - self._parse_content(article) - if not article.get('_title') or not article.get('_analysis_first_line') or not article.get('_analysis') \ - or not article.get('_statement') or not article.get('_statement_attribution') or \ - not article.get('_verdict1') or not article.get('_verdict2') or not article.get('_references'): - missing_fields = { - 'title': True if article.get('_title') else False, - 'subtitle': True if article.get('_analysis_first_line') else False, - 'analysis': True if article.get('_analysis') else False, - 'statement': True if article.get('_statement') else False, - 'statement_attribution': True if article.get('_statement_attribution') else False, - 'verdict1': True if article.get('_verdict1') else False, - 'verdict2': True if article.get('_verdict2') else False, - 'references': True if article.get('_references') else False, - } + def _filter_blocks(self, item, field, bfilter, remove): + """ + Function to filter the embed blocks for video and audio and also will regenerate the html in a more friendly + form using the AppleExporter class + :param item: The article + :param field: the field to operate on + :param bfilter: Filter function to determine if the block is to be kept + :param remove: list of keys to remove + :return: + """ + editor = Editor3Content(item, field, True) + exporter = AppleExporter(editor) + editor.html_exporter = exporter + blocks = [] + for block in editor.blocks: + if bfilter(block, remove): + blocks.append(block) + editor.set_blocks(blocks) + editor.update_item() + + def _not_embed(self, block, remove): + if block.type.lower() == "atomic": + bk = [e.key for e in block.entities if e.key in remove] + if bk: + return False + return True + + def _remove_embeds(self, article, remove_keys): + """ + Removes the nominated embeds from the draftjs state and regenerates the HTML. + :param article: + :param remove_keys + :return: + """ + to_remove = [k.lstrip("editor_") for k in remove_keys] + fields = get_content_state_fields(article) + for field in fields: + self._filter_blocks(article, field, self._not_embed, to_remove) + + for key in remove_keys: + article.get("associations", {}).pop(key, None) + if article.get("refs") is not None: + article["refs"] = [r for r in article.get("refs", []) if r["key"] != key] + + def _remove_unwanted_embeds(self, article): + """ + Removes all embeds that are not images/pictures + :param article: + :return: + """ + remove_keys = [] + + # can only handle pictures at the moment + for key, item in (article.get("associations") or {}).items(): + if key.startswith("editor_") and item.get("type") != 'picture': + remove_keys.append(key) + + self._remove_embeds(article, remove_keys) + + def format_dateline(self, located, current_timestamp): + """ + Formats dateline to "Location, Month Date Source -" + + :return: formatted dateline string + """ - logger.warning('Failed to parse title for item: {}. ' - 'missing fields: {}'.format(article.get('item_id'), missing_fields)) + dateline_location = "{city_code}" + dateline_location_format_fields = located.get("dateline", "city") + dateline_location_format_fields = dateline_location_format_fields.split(",") + if "country" in dateline_location_format_fields and "state" in dateline_location_format_fields: + dateline_location = "{city_code}, {state_code}, {country_code}" + elif "state" in dateline_location_format_fields: + dateline_location = "{city_code}, {state_code}" + elif "country" in dateline_location_format_fields: + dateline_location = "{city_code}, {country_code}" + dateline_location = dateline_location.format(**located) + + if located.get("tz") and located["tz"] != "UTC": + current_timestamp = datetime.fromtimestamp(current_timestamp.timestamp(), tz=timezone(located["tz"])) + else: + current_timestamp = utc_to_local(config.DEFAULT_TIMEZONE, current_timestamp) + if current_timestamp.month == 9: + formatted_date = "Sept {}".format(current_timestamp.strftime("%-d")) + elif 3 <= current_timestamp.month <= 7: + formatted_date = current_timestamp.strftime("%B %-d") + else: + formatted_date = current_timestamp.strftime("%b %-d") + + return "{location}, {mmmdd} at {hhmmpa}".format( + location=dateline_location.upper(), mmmdd=formatted_date, hhmmpa=current_timestamp.strftime('%I:%M%p') + ) - raise Exception('Cannot format the article for Apple News. ' - 'Failed to parse the item: {}.'.format(article.get('item_id'))) + def _format(self, article): + # Remove any video or audio embeds since for apple news they must be externally hosted + self._remove_unwanted_embeds(article) + + apple_news = {} self._set_article_document(apple_news, article) + + # Set the associations for the transmitter to be able to get the binaries + apple_news['associations'] = article.get('associations', {}) return apple_news def can_format(self, format_type, article): """Can format text article that are not preformatted""" - return format_type == self.format_type and is_fact_check(article) \ - and article.get(FORMAT) == FORMATS.HTML + return format_type == self.format_type and article.get(FORMAT) == FORMATS.HTML def _set_advertising_settings(self, apple_news): - """Function to set the adversiting settings""" - apple_news['advertisingSettings'] = { - 'frequency': 5, - 'layout': { - 'margin': { - 'bottom': 15, - 'top': 15 + """Function to set the advertising settings""" + apple_news['autoplacement'] = { + "advertisement": { + "enabled": True, + "bannerType": "any", + "distanceFromMedia": "10vh", + "frequency": 10, + "layout": { + "margin": 10 } } } @@ -100,7 +179,7 @@ def _is_featuremedia_exists(self, article): def _set_language(self, apple_news, article): """Set language""" - apple_news['language'] = article.get('language') or 'en' + apple_news['language'] = 'en-AU' if article.get('language') == 'en' else article.get('language', 'en-AU') def _set_document_style(self, apple_news): """Set document style""" @@ -111,9 +190,8 @@ def _set_article_document(self, apple_news, article): self._set_language(apple_news, article) self._set_metadata(apple_news, article) apple_news['identifier'] = article['item_id'] - apple_news['title'] = article.get('_title') + apple_news['title'] = article.get('headline') apple_news['version'] = self.APPLE_NEWS_VERSION - apple_news['subtitle'] = article.get('_analysis_first_line') self._set_layout(apple_news) self._set_advertising_settings(apple_news) self._set_component_layouts(apple_news) @@ -126,13 +204,19 @@ def _set_metadata(self, apple_news, article): 'dateCreated': self._format_datetime(article.get('firstcreated')), 'datePublished': self._format_datetime(article.get('firstpublished')), 'dateModified': self._format_datetime(article.get('versioncreated')), - 'excerpt': article.get('_title') + 'excerpt': get_text(article.get('abstract', ''), content='html').strip() } + if article.get('byline'): + apple_news['metadata']['authors'] = [article.get('byline')] if self._is_featuremedia_exists(article): - apple_news['metadata']['thumbnailURL'] = 'bundle://header.jpg' + apple_news['metadata']['thumbnailURL'] = 'bundle://featuremedia' - def _format_datetime(self, article_date, date_format='%Y-%m-%dT%H:%M:%S%z'): - return datetime.strftime(utc_to_local(config.DEFAULT_TIMEZONE, article_date), date_format) + def _format_datetime(self, article_date, date_format=None): + if date_format is None: + aware_dt = article_date.astimezone() + return aware_dt.isoformat(timespec='seconds') + else: + return datetime.strftime(utc_to_local(config.DEFAULT_TIMEZONE, article_date), date_format) def _set_layout(self, apple_news): """Set Layout""" @@ -146,71 +230,60 @@ def _set_layout(self, apple_news): def _set_component_layouts(self, apple_news): apple_news['componentLayouts'] = { "bodyLayout": { - "columnSpan": 6, + "columnSpan": 7, "columnStart": 0, "margin": { "bottom": 15, "top": 15 } }, - "claimTagLayout": { - "columnSpan": 7, - "columnStart": 0 - }, "fixed_image_header_container": { "columnSpan": 7, "columnStart": 0, "ignoreDocumentMargin": True, "minimumHeight": "45vh" }, - "fixed_image_header_section": { - "ignoreDocumentMargin": True, - "margin": { - "bottom": 0, - "top": 40 - } - }, - "header-top-spacer": { - "minimumHeight": 30 - }, - "statementAttributionLayout": { - "margin": { - "bottom": 10 - } - }, - "statementLayout": { - "contentInset": True, + "titleLayout": { + "horizontalContentAlignment": "center", + "columnSpan": 5, + "columnStart": 1, "margin": { - "bottom": 10, - "top": 10 + "bottom": 5, + "top": 5 } }, - "subHeaderLayout": { + "captionLayout": { "horizontalContentAlignment": "left", + "columnSpan": 7, + "columnStart": 0, "margin": { - "bottom": 10, - "top": 15 + "bottom": 5, + "top": 5 } }, - "titleLayout": { - "columnSpan": 7, - "columnStart": 0, + "BodyCaptionLayout": { + "horizontalContentAlignment": "left", + "columnSpan": 5, + "columnStart": 1, "margin": { - "bottom": 15, + "bottom": 5, "top": 5 } }, - "verdictContainerLayout": { - "contentInset": True, - "ignoreDocumentMargin": True, + "bylineLayout": { + "columnSpan": 5, + "columnStart": 1, "margin": { - "bottom": 15, + "bottom": 2, "top": 5 } }, - "verdictLayout": { + "dateLineLayout": { + "columnSpan": 5, + "columnStart": 1, "margin": { - "bottom": 20 + "bottom": 5, + "top": 2 } } } @@ -223,7 +296,7 @@ def _set_component_styles(self, apple_news): } apple_news['componentTextStyles'] = { "bodyStyle": { - "fontName": "Merriweather-Regular", + "fontName": "HelveticaNeue", "fontSize": 16, "lineHeight": 26, "linkStyle": { @@ -235,55 +308,32 @@ def _set_component_styles(self, apple_news): "textAlignment": "left", "textColor": "#000" }, - "claimTagStyle": { - "fontName": "Merriweather-Bold", + "bylineStyle": { + "fontName": "HelveticaNeue-Bold", "fontSize": 18, - "lineHeight": 17, - "textAlignment": "left", - "textColor": "#FFF", - "textShadow": { - "color": "#000", - "offset": { - "x": 1, - "y": 1 - }, - "opacity": 0.5, - "radius": 2 - } - }, - "statementAttributionStyle": { - "fontName": "Merriweather-Italic", - "fontSize": 14, - "hyphenation": False, - "lineHeight": 22, - "textAlignment": "right", + "lineHeight": 18, + "textAlignment": "center", "textColor": "#000" }, - "statementStyle": { - "fontName": "Merriweather-BoldItalic", + "dateLineStyle": { + "fontName": "HelveticaNeue-Bold", "fontSize": 18, - "hyphenation": False, - "lineHeight": 26, - "textColor": "#FFF" - }, - "subHeaderStyle": { - "fontName": "FiraSans-Bold", - "fontSize": 30, - "hyphenation": False, - "lineHeight": 40, - "textColor": "#063c7f" + "lineHeight": 18, + "textAlignment": "center", + "textColor": "#000" }, "titleStyle": { - "fontName": "Merriweather-Black", + "fontName": "HelveticaNeue-CondensedBlack", "fontSize": 40, "lineHeight": 50, - "textAlignment": "left", - "textColor": "#FFF" + "textAlignment": "center", + "textColor": "#000" }, - "verdictStyle": { - "fontName": "Merriweather-Regular", - "fontSize": 18, - "lineHeight": 26, + "captionStyle": { + "fontName": "HelveticaNeue-Italic", + "fontSize": 12, + "hyphenation": False, + "lineHeight": 15, "textAlignment": "left", "textColor": "#000" } @@ -291,400 +341,224 @@ def _set_component_styles(self, apple_news): def _set_component(self, apple_news, article): components = [] + components.extend(self._set_header_component(article)) + components.extend(self._set_story_component(article)) apple_news['components'] = components - components.append(self._set_header_component(article)) - components.extend(self._set_statement_component(article)) - components.append({ - 'layout': { - 'horizontalContentAlignment': 'right', - 'margin': { - 'bottom': 5 - }, - 'maximumContentWidth': 180 - }, - 'role': 'divider', - 'stroke': { - 'color': '#063c7f', - 'style': 'dashed', - 'width': 1 - } - }) - components.extend(self._set_verdict_component(article, '_verdict1')) - components.extend(self._set_analysis_component(article)) - components.extend(self._set_verdict_component(article, '_verdict2')) - components.extend(self._set_references_component(article)) - components.extend(self._set_revision_history_component(article)) def _set_header_component(self, article): - header = { + header = [{ 'behaviour': {'type': 'background_parallax'}, 'layout': 'fixed_image_header_container', 'role': 'container', 'style': { 'fill': { - 'URL': 'bundle://header.jpg', + 'URL': 'bundle://featuremedia', 'type': 'image' } - }, - 'components': [ - { - 'anchor': { - 'originAnchorPosition': 'bottom', - 'targetAnchorPosition': 'bottom' - }, - 'components': [ - { - "layout": "titleLayout", - "role": "title", - "text": article.get('_title'), - "textStyle": "titleStyle" - } - ], - 'layout': 'fixed_image_header_section', - 'role': 'section', - 'style': { - 'fill': { - 'angle': 180, - 'colorStops': [ - {'color': '#00000000'}, - {'color': '#063c7f'} - ], - 'type': 'linear_gradient' - } - } - } - ] + } + }, + { + "layout": "captionLayout", + "role": "caption", + "text": "{} - {}".format( + article.get('associations', {}).get('featuremedia', {}).get('description_text', ''), + article.get('associations', {}).get('featuremedia', {}).get('byline', '')), + "textStyle": 'captionStyle' } + ] if not self._is_featuremedia_exists(article): - header.pop('style', None) + return [] return header - def _set_statement_component(self, article): - """Set the statement component - - :param dict article: + def _add_pieces(self, body, pieces, role, embed_url): """ - if not article.get('_statement'): - return [] - - return [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Statement', - 'textStyle': 'subHeaderStyle' - }, - { - 'layout': 'statementLayout', - 'role': 'body', - 'style': { - 'backgroundColor': '#063c7f' - }, - 'text': article.get('_statement'), - 'textStyle': 'statementStyle' - }, - { - 'layout': 'statementAttributionLayout', - 'role': 'body', - 'text': article.get('_statement_attribution'), - 'textStyle': 'statementAttributionStyle' - } - ] - - def _set_analysis_component(self, article): - """Set the analysis component - - :param dict article: + Adds the content so far to the body content, then adds the embed, and clears the pieces + :param body: the body built so far + :param pieces: the pieces accumulated + :param role: + :param embed_url: + :return: """ - if not article.get('_analysis'): - return [] + body.extend([{ + 'format': 'html', + 'layout': 'bodyLayout', + 'role': 'body', + 'text': ''.join(pieces), + 'textStyle': 'bodyStyle' + }, { + "role": role, + "layout": "bodyLayout", + "URL": embed_url + }]) + pieces.clear() + return + + def generate_article_content(self, article): + + fragments = lxml_html.fragments_fromstring(article.get('body_html', '
')) + par_pieces = [] + body_content = [] + + for elem in fragments: + if elem.tag == 'figure': + key = elem.find('./img').attrib['id'] + body_content.extend([ + { + 'format': 'html', + 'layout': 'bodyLayout', + 'role': 'body', + 'text': ''.join(par_pieces), + 'textStyle': 'bodyStyle' + }, + { + 'role': 'figure', + 'URL': 'bundle://{}'.format(key), + 'identifier': key, + 'accessibilityCaption': elem.find('./img').attrib['alt'], + 'caption': elem.find('./figcaption').text, + 'layout': 'bodyLayout' + }, + { + "layout": "BodyCaptionLayout", + "role": "caption", + "text": elem.find('./figcaption').text, + "textStyle": 'captionStyle' + } + ]) + par_pieces.clear() + elif elem.tag == 'div' and 'embed-block' in elem.attrib.get('class', ''): + bq = elem.find('./blockquote') + if bq is not None: + if bq.attrib.get('class') == 'twitter-tweet': + tweet = bq.find('./a').attrib.get('href', '') + if 'twitter' in tweet: + self._add_pieces(body_content, par_pieces, "tweet", tweet) + elif bq.attrib.get('class') == 'instagram-media': + insta_link = bq.attrib.get('data-instgrm-permalink') + if insta_link: + self._add_pieces(body_content, par_pieces, "instagram", insta_link) + elif bq.attrib.get('class') == 'tiktok-embed': + tiktok = bq.attrib.get('cite') + if tiktok: + self._add_pieces(body_content, par_pieces, "tiktok", tiktok) + else: + iframe = elem.find("./iframe") + if iframe is not None: + src = iframe.attrib.get('src') + if src: + url = urlparse(src) + query = unquote(url.query) + if query.startswith('href='): + fburl = query[len('href='):] + self._add_pieces(body_content, par_pieces, 'facebook_post', fburl) + else: + par_pieces.append(render_fragment(elem)) + # Add what is left over + body_content.append({ + 'format': 'html', + 'layout': 'bodyLayout', + 'role': 'body', + 'text': ''.join(par_pieces), + 'textStyle': 'bodyStyle' + }) - return [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Analysis', - 'textStyle': 'subHeaderStyle' - }, - { + if article.get('body_footer', '') != '': + body_content.append({ 'format': 'html', 'layout': 'bodyLayout', 'role': 'body', - 'text': article.get('_analysis'), - 'textStyle': 'bodyStyle' - } - ] + 'text': article.get('body_footer', ''), + 'textStyle': 'bodyStyle'} + ) - def _set_verdict_component(self, article, field_name): - """Set the verdict component + return body_content - :param dict article: - """ - if not article.get(field_name): - return [] + def _set_story_component(self, article): - return [ + article_body = self.generate_article_content(article) + + story_component = [ { - 'components': [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Verdict', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'verdictLayout', - 'role': 'body', - 'text': article.get(field_name), - 'textStyle': 'verdictStyle' + "layout": "titleLayout", + "role": "title", + "text": article.get('headline'), + "textStyle": "titleStyle", + "format": "html" + }, + { + 'role': 'divider', + 'layout': { + 'columnStart': 2, + 'columnSpan': 3, + 'margin': { + 'top': 5, + 'bottom': 5 } - ], - 'layout': 'verdictContainerLayout', - 'role': 'container', - 'animation': { - 'type': 'move_in', - 'preferredStartingPosition': 'left' }, - 'style': { - 'backgroundColor': '#e7ebf1' + 'stroke': { + 'color': '#063c7f', + 'style': 'solid', + 'width': 1 } - } - ] - - def _set_references_component(self, article): - """Set the references component - - :param dict article: - """ - if not article.get('_references'): - return [] - - return [ - { - "layout": "subHeaderLayout", - "role": "heading", - "text": "The References", - "textStyle": "subHeaderStyle" }, { - "format": "html", - "layout": "bodyLayout", - "role": "body", - "text": article.get('_references'), - "textStyle": "bodyStyle" - } - ] - - def _set_revision_history_component(self, article): - """Set the revision history component - - :param dict article: - """ - if not article.get('_revision_history'): - return [] - - return [ - { - "layout": "subHeaderLayout", - "role": "heading", - "text": "Revision History", - "textStyle": "subHeaderStyle" + 'role': 'byline', + 'text': 'By {}'.format(article.get('byline')), + 'layout': 'bylineLayout', + 'textStyle': 'bylineStyle' }, { - "format": "html", - "layout": "bodyLayout", - "role": "body", - "text": article.get('_revision_history'), - "textStyle": "bodyStyle" + 'role': 'byline', + 'text': self.format_dateline(article.get('dateline', {}).get('located'), + get_date(article.get('versioncreated'))), + 'layout': 'dateLineLayout', + 'textStyle': 'dateLineStyle' } ] - - def _parse_content(self, article): - """Parse body_html and mapping to fields required for apple news format - - :param article: - """ - statement_regex = re.compile(r'^The Statement$', re.IGNORECASE) - analysis_regex = re.compile(r'^The Analysis$', re.IGNORECASE) - verdict_regex = re.compile(r'^The Verdict$', re.IGNORECASE) - references_regex = re.compile(r'^The References$', re.IGNORECASE) - abstract = get_text(article.get('abstract'), content='html').strip() - - article['_title'] = abstract - body_html = article.get('body_html') - article['_analysis_first_line'] = '' - article['_analysis'] = '' - article['_statement'] = '' - article['_statement_attribution'] = '' - article['_verdict1'] = '' - article['_verdict2'] = '' - article['_references'] = '' - article['_revision_history'] = '' - - if article.get(ITEM_STATE) == CONTENT_STATE.KILLED or article.get(ITEM_STATE) == CONTENT_STATE.RECALLED: - article['_title'] = 'This article has been removed.' - article['_analysis_first_line'] = 'This article has been removed.' - article['_analysis'] = 'This article has been removed.' - article['_statement'] = 'This article has been removed.' - article['_statement_attribution'] = 'This article has been removed.' - article['_verdict1'] = 'This article has been removed.' - article['_verdict2'] = 'This article has been removed.' - article['_references'] = 'This article has been removed.' - self._set_revision_history(article) - return - - parsed_content = parse_html(body_html, content='html') - statement_found = False - analysis_found = False - analysis_first_line = False - verdict1_found = False - verdict2_found = False - references_found = False - statement_elements = [] - - for top_level_tag in parsed_content.xpath('/div/child::*'): - tag_text = format_text_content(top_level_tag).strip() - if not tag_text: - continue - - if not verdict1_found: - if not statement_found: - match = statement_regex.search(tag_text) - if match: - statement_found = True - continue - else: - # statement found - match = verdict_regex.search(tag_text) - if match: - verdict1_found = True - if len(statement_elements) > 1: - statement_length = len(statement_elements) - 1 - for i in range(statement_length): - article['_statement'] += get_text( - to_string(statement_elements[i], remove_root_div=False), - content='html' - ).strip() - if statement_length > 1 and i != statement_length - 1: - article['_statement'] += '\r\n' - - article['_statement_attribution'] = get_text( - to_string(statement_elements[-1:][0], remove_root_div=False), - content='html' - ).strip() - elif len(statement_elements) == 1: - article['_statement'] = to_string( - statement_elements[0], - remove_root_div=False - ) - continue - - statement_elements.append(top_level_tag) - continue - - if verdict1_found and not analysis_found: - match = analysis_regex.search(tag_text) - if match: - analysis_found = True - else: - article['_verdict1'] += to_string(top_level_tag, remove_root_div=False) - continue - - if analysis_found and not verdict2_found: - if not analysis_first_line: - article['_analysis_first_line'] = tag_text - analysis_first_line = True - - match = verdict_regex.search(tag_text) - if match: - verdict2_found = True - else: - article['_analysis'] += to_string(top_level_tag, remove_root_div=False) - continue - - if verdict2_found and not references_found: - match = references_regex.search(tag_text) - if match: - references_found = True - else: - article['_verdict2'] += to_string(top_level_tag, remove_root_div=False) - continue - - if references_found: - tag_text = re.sub(r'^\d*\s*[.):]?', '', tag_text).strip() - - article['_references'] += 'The Statement
' 'This is statement first line
' 'This is statement second line
' @@ -73,19 +64,8 @@ def _get_article(self): } - def test_can_format_fact_check(self): + def test_can_format_check(self): self.assertTrue( - self.formatter.can_format( - self.formatter.format_type, - { - 'type': 'text', - 'genre': [{'qcode': 'Fact Check'}], - 'format': 'HTML' - } - ) - ) - - self.assertFalse( self.formatter.can_format( self.formatter.format_type, { @@ -96,330 +76,413 @@ def test_can_format_fact_check(self): ) ) - def test_parse_statement(self): - article = self._get_article() - self.formatter._parse_content(article) - self.assertEqual(article.get('_statement'), 'This is statement first line') - self.assertEqual(article.get('_statement_attribution'), 'This is statement second line') - self.assertEqual( - article.get('_analysis'), - 'This is analysis first line
' - 'This is analysis second line
' - ) - self.assertEqual( - article.get('_verdict1'), - 'This is verdict 1 first line
' - 'This is verdict 1 second line
' - ) - - self.assertEqual( - article.get('_verdict2'), - 'This is verdict 2 first line
' - 'This is verdict 2 second line
' - ) - - self.assertEqual( - article.get('_references'), - 'The Statement
'\ - 'This is statement first line
' \ - '' \ - 'The Verdict
' \ - 'This is verdict first line
' \ - 'This is verdict second line
' \ - '' \ - 'The Analysis
'\ - 'This is analysis first line
'\ - 'This is analysis second line
'\ - ''\ - 'The Verdict
'\ - 'This is verdict first line
'\ - 'This is verdict second line
'\ - ''\ - 'The References
'\ - '1. This is references http://test.com
'\ - '2. This is references second line
'\ - '' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) - - def test_format_article_raises_exception_if_analysis_missing(self): + def test_format_title(self): article = self._get_article() - article['body_html'] = 'The Statement
'\ - 'This is statement first line
'\ - ''\ - 'The Verdict
'\ - 'This is verdict first line
'\ - 'This is verdict second line
'\ - ''\ - 'The References
'\ - '1. This is references http://test.com
'\ - '2. This is references second line
'\ - '' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) + apple_news = self.formatter._format(article) + self.assertEqual(apple_news.get('identifier'), '1') + self.assertEqual(apple_news.get('title'), 'Headline of the story') + self.assertEqual(apple_news.get('components'), [{"layout": "titleLayout", + "role": "title", "text": "Headline of the story", + "textStyle": "titleStyle", + "format": "html"}, + {"role": "divider", + "layout": {"columnStart": 2, "columnSpan": 3, + "margin": {"top": 5, "bottom": 5}}, + "stroke": {"color": "#063c7f", "style": "solid", "width": 1}}, + {"role": "byline", "text": "By John Doe", + "layout": "bylineLayout", + "textStyle": "bylineStyle"}, + {"role": "byline", "text": "SYDNEY, Feb 16 at 12:45AM", + "layout": "dateLineLayout", "textStyle": "dateLineStyle"}, + {"format": "html", "layout": "bodyLayout", "role": "body", + "text": "The Statement
" + "This is statement first line
" + "This is statement second line
" + "The Verdict
" + "This is verdict 1 first line
" + "This is verdict 1 second line
" + "The Analysis
" + "This is analysis first line
" + "This is analysis second line
" + "The Verdict
" + "This is verdict 2 first line
" + "This is verdict 2 second line
" + "The References
" + "1. This is references http://test.com
" + "2. This is references second line
The Statement
'\ - 'This is statement first line
' \ - 'This is statement second line
' \ - ''\ - 'The Analysis
'\ - 'This is analysis first line
'\ - 'This is analysis second line
'\ - ''\ - 'The References
'\ - '1. This is references http://test.com
'\ - '2. This is references second line
'\ - '' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) + article['associations'] = {'featuremedia': {'description_text': 'Protesters participate in a Halloween themed ' + 'Extinction Rebellion rally in Sydney, ' + 'Thursday, October 31, 2019.'}, + 'editor_0': {'type': 'video'}, + 'editor_1': {'type': 'picture'}} + article['fields_meta'] = { + "body_html": { + "draftjsState": [ + { + "blocks": [ + { + "key": "f8mk1", + "text": "First paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + }, + { + "key": "97qeo", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 0 + } + ], + "data": {} + }, + { + "key": "bu6bt", + "text": "Second paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "66lpo", + "text": "Third paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "4sgtb", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 1 + } + ], + "data": {} + }, + { + "key": "9n4jj", + "text": "Fourth paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "1trdb", + "text": "Fifth paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "2jrhi", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 2 + } + ], + "data": {} + }, + { + "key": "d51og", + "text": "Sixth paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + } + ], + "entityMap": { + "0": { + "type": "MEDIA", + "mutability": "MUTABLE", + "data": { + } + }, + "1": { + "type": "MEDIA", + "mutability": "MUTABLE", + "data": { + "media": { + "headline": "POLESTAR ELECTRIC VEHICLE", + "alt_text": "Alt Text", + "description_text": "Description text or caption", + "source": "PR Handout Image", + "byline": "PR Handout Image/POLESTAR", + "type": "picture", + "format": "HTML", + } + } + }, + "2": { + "type": "EMBED", + "mutability": "MUTABLE", + "data": { + "data": { + "html": "" + } + } + } + } + } + ] + }} + apple_news = self.formatter._format(article) + self.assertEqual(apple_news['components'][7]['URL'], 'bundle://editor_1') + self.assertEqual(apple_news['components'][0]['style']['fill']['URL'], 'bundle://featuremedia') + self.assertEqual(apple_news['components'][10]['URL'], 'https://twitter.com/AAPNewswire/status/1') - def test_format_article_raises_exception_if_references_missing(self): + def test_format_article_with_instagram(self): article = self._get_article() - article['body_html'] = '" + ""This is actually my first time to ever enter a competition" + "."
— " + "Australian Associated Press (AAP) (@AAPNewswire) " + "" + "November 16, 2023
Photographer Jialing Cai went diving in the dark to " + "capture her award-winning image of a female paper nautilus, a type " + "of octopus that can grow its own shell.
Via " + "" + "@liz: " + "https://t.co/u1rGHr1heD " + "pic.twitter.com/SIBTwJfisP
The Statement
'\ - 'This is statement first line
' \ - 'This is statement second line
' \ - ''\ - 'The Analysis
'\ - 'This is analysis first line
'\ - 'This is analysis second line
'\ - ''\ - 'The Verdict
'\ - 'This is verdict first line
'\ - 'This is verdict second line
'\ - '' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) + article['fields_meta'] = { + "body_html": { + "draftjsState": [ + { + "blocks": [ + { + "key": "bkf9p", + "text": "instagram", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + }, + { + "key": "ed90t", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 0 + } + ], + "data": {} + }, + { + "key": "30a8e", + "text": "", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + } + ], + "entityMap": { + "0": { + "type": "EMBED", + "mutability": "MUTABLE", + "data": { + "data": { + "html": " " + }, + "description": "Test Instagram post" + } + } + } + } + ] + } + } + apple_news = self.formatter._format(article) + self.assertEqual(apple_news['components'][5]['URL'], "https://www.instagram.com/reel/C") - def test_format_title(self): + def test_format_article_with_facebook(self): article = self._get_article() + article['fields_meta'] = { + "body_html": { + "draftjsState": [ + { + "blocks": [ + { + "key": "tqgt", + "text": "Facebook post", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + }, + { + "key": "b0nn5", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 0 + } + ], + "data": {} + }, + { + "key": "1loq9", + "text": "Following text", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + } + ], + "entityMap": { + "0": { + "type": "EMBED", + "mutability": "MUTABLE", + "data": { + "data": { + "html": "" + }, + "description": "Embed description" + } + } + } + } + ] + } + } apple_news = self.formatter._format(article) - self.assertEqual(apple_news.get('identifier'), '1') - self.assertEqual(apple_news.get('title'), 'This is abstract') - self.assertEqual(apple_news.get('subtitle'), 'This is analysis first line') - self.assertEqual(apple_news.get('components'), - [ - { - 'behaviour': { - 'type': 'background_parallax' - }, - 'components': [{ - 'anchor': { - 'originAnchorPosition': 'bottom', - 'targetAnchorPosition': 'bottom' - }, - 'components': [{ - 'layout': 'titleLayout', - 'role': 'title', - 'text': 'This is abstract', - 'textStyle': 'titleStyle' - }], - 'layout': 'fixed_image_header_section', - 'role': 'section', - 'style': { - 'fill': { - 'angle': 180, - 'colorStops': [ - {'color': '#00000000'}, - {'color': '#063c7f'} - ], - 'type': 'linear_gradient' - } - } - }], - 'layout': 'fixed_image_header_container', - 'role': 'container' - }, - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Statement', - 'textStyle': 'subHeaderStyle' - }, - { - 'layout': 'statementLayout', - 'role': 'body', - 'style': { - 'backgroundColor': '#063c7f' - }, - 'text': 'This is statement first line', - 'textStyle': 'statementStyle' - }, - { - 'layout': 'statementAttributionLayout', - 'role': 'body', - 'text': 'This is statement second line', - 'textStyle': 'statementAttributionStyle' - }, - { - 'layout': { - 'horizontalContentAlignment': 'right', - 'margin': { - 'bottom': 5 - }, - 'maximumContentWidth': 180 - }, - 'role': 'divider', - 'stroke': { - 'color': '#063c7f', - 'style': 'dashed', - 'width': 1 - } - }, - { - 'animation': { - 'preferredStartingPosition': 'left', - 'type': 'move_in' - }, - 'components': [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Verdict', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'verdictLayout', - 'role': 'body', - 'text': 'This is verdict 1 first line
' - 'This is verdict 1 second line
', - 'textStyle': 'verdictStyle' - } - ], - 'layout': 'verdictContainerLayout', - 'role': 'container', - 'style': { - 'backgroundColor': '#e7ebf1' - } - }, - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Analysis', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'bodyLayout', - 'role': 'body', - 'text': 'This is analysis first line
' - 'This is analysis second line
', - 'textStyle': 'bodyStyle' - }, - { - 'animation': { - 'preferredStartingPosition': 'left', - 'type': 'move_in' - }, - 'components': [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Verdict', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'verdictLayout', - 'role': 'body', - 'text': 'This is verdict 2 first line
' - 'This is verdict 2 second line
', - 'textStyle': 'verdictStyle' - } - ], - 'layout': 'verdictContainerLayout', - 'role': 'container', - 'style': { - 'backgroundColor': '#e7ebf1' - } - }, - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The References', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'bodyLayout', - 'role': 'body', - 'text': '