Skip to content

Commit

Permalink
Remove embeds from selected downloads
Browse files Browse the repository at this point in the history
  • Loading branch information
marwoodandrew committed Oct 16, 2023
1 parent 4847ea3 commit 7511520
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 43 deletions.
39 changes: 39 additions & 0 deletions features/news_api_item.feature
Original file line number Diff line number Diff line change
Expand Up @@ -204,4 +204,43 @@ Feature: News API Item
"headline": "headline 1",
"associations": {"featuremedia": {"renditions": {"original": {}} }}
}
"""

Scenario: Item request response strips embeds
Given "items"
"""
[{"_id": "111", "body_html": "<p>Once upon a time there was </p><div class=\"embed-block\">a fish</div><p> who could swim</p><p><!-- EMBED START Image {id: \"editor_19\"} --><figure><img src=\"somthing\" alt=\"alt text\" id=\"editor_19\"<figcaption>Some caption</figcaption></figure><!-- EMBED END Image {id: \"editor_19\"} --></p>",
"headline": "headline 1",
"firstpublished": "#DATE-1#", "versioncreated": "#DATE#",
"associations": {"editor_19": {"products": [{"code": "1234"}], "renditions": {"original": {}} }}}]
"""
Given "products"
"""
[{"name": "A fishy Product",
"decsription": "a product for those interested in fish",
"companies" : [
"#companies._id#"
],
"query": "Once upon a time",
"product_type": "news_api"
},
{"name": "A fishy superdesk product",
"description": "a superdesk product restricting images in the atom feed",
"companies" : [
"#companies._id#"
],
"sd_product_id": "1234",
"product_type": "news_api"
}
]
"""
When we get "/news/item/111?format=NINJSFormatter3"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><p> who could swim</p><p></p>",
"associations": {}
}
"""
29 changes: 1 addition & 28 deletions newsroom/monitoring/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from flask import current_app as app
from lxml import html as lxml_html
import re
import collections
from superdesk.text_utils import get_text
from newsroom.utils import get_items_by_id
from newsroom.utils import get_items_by_id, remove_all_embeds
from superdesk import etree as sd_etree


Expand Down Expand Up @@ -69,28 +67,3 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False):
items = get_items_by_id(_ids, 'items')
truncate_article_body(items, monitoring_profile, full_text)
return items


def remove_all_embeds(item):
    """
    Strip every editor embed (image, video or audio) from the article body.

    An embed is delimited by an ``EMBED START`` and an ``EMBED END`` HTML comment. For each
    start comment found, the following siblings are removed up to and including the first
    one whose text contains `` EMBED END ``, and then the start comment itself is removed.
    ``item["body_html"]`` is only rewritten when at least one embed was found.

    :param item: article dict, its ``body_html`` is updated in place
    :return: None
    """
    embed_start = r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)"
    # fall back to an empty paragraph so a missing/empty body still parses
    tree = lxml_html.fromstring(item.get('body_html') or '<p></p>')
    changed = False
    for marker in tree.xpath('//comment()'):
        match = re.search(embed_start, marker.text)
        if not (match and match.group(1)):
            # not an embed start comment, nothing to do
            continue
        container = marker.getparent()
        # drop everything that follows the start marker, stopping after the end marker
        for sibling in marker.itersiblings():
            container.remove(sibling)
            if sibling.text and ' EMBED END ' in sibling.text:
                break
        container.remove(marker)
        changed = True
    if changed:
        item["body_html"] = sd_etree.to_string(tree, method="html")
34 changes: 34 additions & 0 deletions newsroom/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytz
import re
from lxml import html as lxml_html
from lxml.html import clean

from superdesk.etree import to_string
from superdesk.utc import utcnow
Expand Down Expand Up @@ -460,3 +461,36 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid
body_updated = update_video(item, elem, m.group(1)) or body_updated
if body_updated:
item['body_html'] = to_string(root_elem, method="html")


def remove_all_embeds(item):
    """
    Remove all embeds from the body of the article, modifying the item in place.

    Three things are stripped:

    * every ``<figure>`` element together with its content (via the lxml ``Cleaner``),
    * every ``<div>`` carrying the ``embed-block`` class (embedded tweets etc.), wherever it
      sits in the body; text following such a div is kept,
    * every association whose key starts with ``editor_``, as these belong to the embeds.

    Items without a ``body_html`` are left completely untouched.

    :param item: article dict, its ``body_html`` and ``associations`` are updated in place
    :return: None
    """
    if not item.get("body_html"):
        return

    root_elem = lxml_html.fromstring(item["body_html"])

    # clean all the embedded figures from the html; clean_html returns a cleaned copy
    cleaner = clean.Cleaner(add_nofollow=False, kill_tags=["figure"])
    cleaned_xhtml = cleaner.clean_html(root_elem)

    # all embedded tweets etc. should be in a div with the class embed-block, these are removed.
    # Match the class as a token so divs with additional classes are caught as well.
    embed_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' embed-block ')]"
    body_is_embed = False
    for embed in cleaned_xhtml.xpath(embed_xpath):
        if embed.getparent() is None:
            # the root element itself is the embed block, so nothing of the body remains
            body_is_embed = True
            break
        # drop_tree detaches the element from its actual parent (it may be nested at any
        # depth) and keeps the tail text that follows it
        embed.drop_tree()

    # remove the associations relating to the embeds; tolerate a missing or None value
    associations = item.get("associations") or {}
    for key in [key for key in associations if key.startswith("editor_")]:
        associations.pop(key, None)

    if body_is_embed:
        item["body_html"] = ""
    else:
        item["body_html"] = to_string(cleaned_xhtml, encoding="unicode", method="html")
16 changes: 2 additions & 14 deletions newsroom/wire/formatters/html.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import flask
from .base import BaseFormatter
from lxml import html as lxml_html
from lxml.html import clean
from lxml import etree
from newsroom.utils import remove_all_embeds


class HTMLFormatter(BaseFormatter):
Expand All @@ -14,17 +12,7 @@ class HTMLFormatter(BaseFormatter):
MIMETYPE = 'text/html'

def format_item(self, item, item_type='items'):

# clean all the embedded figures from the html
blacklist = ["figure"]
root_elem = lxml_html.fromstring(item.get("body_html", ""))
cleaner = clean.Cleaner(
add_nofollow=False,
kill_tags=blacklist
)
cleaned_xhtml = cleaner.clean_html(root_elem)

item["body_html"] = etree.tostring(cleaned_xhtml, encoding="unicode", method='html')
remove_all_embeds(item)

if item_type == 'items':
return str.encode(flask.render_template('download_item.html', item=item), 'utf-8')
Expand Down
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/newsmlg2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from superdesk.publish.formatters.nitf_formatter import NITFFormatter
from superdesk.publish.formatters.newsml_g2_formatter import NewsMLG2Formatter as SuperdeskFormatter
from newsroom.utils import remove_all_embeds

from .base import BaseFormatter

Expand Down Expand Up @@ -34,6 +35,7 @@ class NewsMLG2Formatter(BaseFormatter):
nitf_formatter = NITFFormatter()

def format_item(self, item, item_type='items'):
remove_all_embeds(item)
item = item.copy()
item.setdefault('guid', item['_id'])
item.setdefault('_current_version', item['version'])
Expand Down
13 changes: 13 additions & 0 deletions newsroom/wire/formatters/ninjs2.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .ninjs import NINJSFormatter
from newsroom.news_api.utils import check_featuremedia_association_permission
from newsroom.wire.formatters.utils import remove_internal_renditions
from newsroom.utils import remove_all_embeds


class NINJSFormatter2(NINJSFormatter):
Expand All @@ -18,3 +19,15 @@ def _transform_to_ninjs(self, item):
if not item.get('associations'):
item.pop('associations', None)
return remove_internal_renditions(super()._transform_to_ninjs(item), remove_media=True)


class NINJSFormatter3(NINJSFormatter2):
    """
    NINJS formatter whose output contains no embeds.

    The embeds are removed from the item before it is handed to the parent formatter.
    """

    def _transform_to_ninjs(self, item):
        # strip the embeds first so the parent transform only ever sees the cleaned item
        remove_all_embeds(item)
        return super()._transform_to_ninjs(item)
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/nitf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

from lxml import etree
from superdesk.publish.formatters.nitf_formatter import NITFFormatter as SuperdeskNITFFormatter
from newsroom.utils import remove_all_embeds

from .base import BaseFormatter

Expand All @@ -14,6 +15,7 @@ class NITFFormatter(BaseFormatter):
formatter = SuperdeskNITFFormatter()

def format_item(self, item, item_type='items'):
    """
    Serialise the item as an NITF XML document with all embeds removed.

    :param item: the item to format, its embeds are removed in place first
    :param item_type: not used by this formatter
    :return: the serialised NITF document produced by ``etree.tostring``
    """
    remove_all_embeds(item)
    # the destination dict is not read back here, so a throwaway one is passed
    nitf_tree = self.formatter.get_nitf(item, {}, '')
    return etree.tostring(nitf_tree, xml_declaration=True, pretty_print=True, encoding=self.encoding)
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

import flask
from .base import BaseFormatter
from newsroom.utils import remove_all_embeds


class TextFormatter(BaseFormatter):
Expand All @@ -9,6 +10,7 @@ class TextFormatter(BaseFormatter):
MIMETYPE = 'text/plain'

def format_item(self, item, item_type='items'):
remove_all_embeds(item)
if item_type == 'items':
return str.encode(flask.render_template('download_item.txt', item=item), 'utf-8')
else:
Expand Down
3 changes: 2 additions & 1 deletion newsroom/wire/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,8 @@ def download(_ids):

update_action_list(_ids.split(','), 'downloads', force_insert=True)
get_resource_service('history').create_history_record(items, 'download', user, request.args.get('type', 'wire'))
return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True)
return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True,
cache_timeout=0)


@blueprint.route('/wire_share', methods=['POST'])
Expand Down

0 comments on commit 7511520

Please sign in to comment.