Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove embeds from selected downloads #1181

Merged
merged 2 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions features/news_api_item.feature
Original file line number Diff line number Diff line change
Expand Up @@ -204,4 +204,72 @@ Feature: News API Item
"headline": "headline 1",
"associations": {"featuremedia": {"renditions": {"original": {}} }}
}
"""

Scenario: Item request response strips embeds
Given "items"
"""
[{"_id": "111", "body_html": "<p>Once upon a time there was </p><div class=\"embed-block\">a fish</div><p> who could swim</p><p><!-- EMBED START Image {id: \"editor_19\"} --><figure><img src=\"somthing\" alt=\"alt text\" id=\"editor_19\"<figcaption>Some caption</figcaption></figure><!-- EMBED END Image {id: \"editor_19\"} --></p>",
"headline": "headline 1",
"firstpublished": "#DATE-1#", "versioncreated": "#DATE#",
"associations": {"editor_19": {"products": [{"code": "1234"}], "renditions": {"original": {}} }}}]
"""
Given "products"
"""
[{"name": "A fishy Product",
"decsription": "a product for those interested in fish",
"companies" : [
"#companies._id#"
],
"query": "Once upon a time",
"product_type": "news_api"
},
{"name": "A fishy superdesk product",
"description": "a superdesk product restricting images in the atom feed",
"companies" : [
"#companies._id#"
],
"sd_product_id": "1234",
"product_type": "news_api"
}
]
"""
When we get "/news/item/111?format=NINJSFormatter&no_embeds=true&no_media=1"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><p> who could swim</p><p></p>"
}
"""
When we get "/news/item/111?format=NINJSFormatter2&no_embeds=true"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><p> who could swim</p><p><!-- EMBED START Image {id: \"editor_19\"} --><figure><img src=\"somthing\" alt=\"alt text\" id=\"editor_19\">Some caption</figure><!-- EMBED END Image {id: \"editor_19\"} --></p>",
"associations": {"editor_19": {"renditions": {"original": {}}}}
}
"""
When we get "/news/item/111?format=NINJSFormatter2&no_media=true"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><div class=\"embed-block\">a fish</div><p> who could swim</p><p></p>",
"associations": {}
}
"""
When we get "/news/item/111?format=NINJSFormatter3"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><p> who could swim</p><p></p>",
"associations": {}
}
"""
29 changes: 1 addition & 28 deletions newsroom/monitoring/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from flask import current_app as app
from lxml import html as lxml_html
import re
import collections
from superdesk.text_utils import get_text
from newsroom.utils import get_items_by_id
from newsroom.utils import get_items_by_id, remove_all_embeds
from superdesk import etree as sd_etree


Expand Down Expand Up @@ -69,28 +67,3 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False):
items = get_items_by_id(_ids, 'items')
truncate_article_body(items, monitoring_profile, full_text)
return items


def remove_all_embeds(item):
    """
    Strip the editor media embeds (image, video, audio) from the body of the article.

    An embed is located by its ``EMBED START`` comment. That comment is removed together with
    the siblings that follow it, up to and including the first sibling whose text contains
    `` EMBED END ``. If no such sibling exists, all following siblings are removed.
    ``item["body_html"]`` is only rewritten when at least one embed was found.

    :param item: article dict, its ``body_html`` is updated in place
    :return: None
    """
    tree = lxml_html.fromstring(item.get('body_html') or '<p></p>')
    start_marker = re.compile(r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)")
    removed_any = False
    for node in tree.xpath('//comment()'):
        found = start_marker.search(node.text)
        # only comments that open an embed are of interest
        if not (found and found.group(1)):
            continue
        container = node.getparent()
        for sibling in node.itersiblings():
            container.remove(sibling)
            # the matching end comment closes the embed, stop removing after it
            if sibling.text and ' EMBED END ' in sibling.text:
                break
        container.remove(node)
        removed_any = True
    if removed_any:
        item["body_html"] = sd_etree.to_string(tree, method="html")
37 changes: 37 additions & 0 deletions newsroom/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytz
import re
from lxml import html as lxml_html
from lxml.html import clean

from superdesk.etree import to_string
from superdesk.utc import utcnow
Expand Down Expand Up @@ -460,3 +461,39 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid
body_updated = update_video(item, elem, m.group(1)) or body_updated
if body_updated:
item['body_html'] = to_string(root_elem, method="html")


def remove_all_embeds(item, remove_by_class=True, remove_media_embeds=True):
    """
    Remove the embeds from the body of the article, including any divs with the embed-block class.

    The item is modified in place: ``body_html`` is rewritten and, when media embeds are removed,
    the associations whose key starts with ``editor_`` are dropped as well. Nothing is changed
    when the item has no (or an empty) ``body_html``.

    :param item: article dict, ``body_html`` and ``associations`` are read and updated
    :param remove_by_class: If true removes any divs that have the embed-block class, should remove such things as
      embedded tweets
    :param remove_media_embeds: Remove any figure tags if the passed value is true
    :return: None
    """
    body_html = item.get("body_html")
    if not body_html:
        return

    root_elem = lxml_html.fromstring(body_html)

    if remove_by_class:
        # all embedded tweets etc should be in a div with the class embed-block, these are removed.
        # The class attribute is matched as a token list so divs with additional classes are found too.
        embed_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' embed-block ')]"
        for embed in root_elem.xpath(embed_xpath):
            if embed.getparent() is None:
                continue
            # drop_tree keeps the tail text (article text following the embed),
            # parent.remove(embed) would discard it together with the element
            embed.drop_tree()

    if not remove_media_embeds:
        item["body_html"] = to_string(root_elem, encoding="unicode", method='html')
        return

    # clean all the embedded figures from the html, it will remove the comments as well
    cleaner = clean.Cleaner(add_nofollow=False, kill_tags=["figure"])
    cleaned_xhtml = cleaner.clean_html(root_elem)

    # remove the associations relating to the embeds, tolerating a missing or null associations value
    associations = item.get("associations") or {}
    for key in [key for key in associations if key.startswith("editor_")]:
        del associations[key]

    item["body_html"] = to_string(cleaned_xhtml, encoding="unicode", method='html')
16 changes: 2 additions & 14 deletions newsroom/wire/formatters/html.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import flask
from .base import BaseFormatter
from lxml import html as lxml_html
from lxml.html import clean
from lxml import etree
from newsroom.utils import remove_all_embeds


class HTMLFormatter(BaseFormatter):
Expand All @@ -14,17 +12,7 @@ class HTMLFormatter(BaseFormatter):
MIMETYPE = 'text/html'

def format_item(self, item, item_type='items'):

# clean all the embedded figures from the html
blacklist = ["figure"]
root_elem = lxml_html.fromstring(item.get("body_html", ""))
cleaner = clean.Cleaner(
add_nofollow=False,
kill_tags=blacklist
)
cleaned_xhtml = cleaner.clean_html(root_elem)

item["body_html"] = etree.tostring(cleaned_xhtml, encoding="unicode", method='html')
remove_all_embeds(item)

if item_type == 'items':
return str.encode(flask.render_template('download_item.html', item=item), 'utf-8')
Expand Down
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/newsmlg2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from superdesk.publish.formatters.nitf_formatter import NITFFormatter
from superdesk.publish.formatters.newsml_g2_formatter import NewsMLG2Formatter as SuperdeskFormatter
from newsroom.utils import remove_all_embeds

from .base import BaseFormatter

Expand Down Expand Up @@ -34,6 +35,7 @@ class NewsMLG2Formatter(BaseFormatter):
nitf_formatter = NITFFormatter()

def format_item(self, item, item_type='items'):
remove_all_embeds(item)
item = item.copy()
item.setdefault('guid', item['_id'])
item.setdefault('_current_version', item['version'])
Expand Down
16 changes: 16 additions & 0 deletions newsroom/wire/formatters/ninjs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import flask
import json
from .base import BaseFormatter
from superdesk.utils import json_serialize_datetime_objectId
from newsroom.utils import remove_all_embeds


class NINJSFormatter(BaseFormatter):
Expand All @@ -20,7 +22,21 @@ def format_item(self, item, item_type='items'):

return json.dumps(ninjs, default=json_serialize_datetime_objectId)

@staticmethod
def test_for_true(value):
"""
Test if the value indicates false
:param value:
:return:
"""
return value.lower() == 'true' or value == '1'

def _transform_to_ninjs(self, item):
no_embeds = flask.request.args.get('no_embeds', default=False, type=self.test_for_true)
no_media = flask.request.args.get('no_media', default=False, type=self.test_for_true)
if no_media or no_embeds:
remove_all_embeds(item, remove_media_embeds=no_media, remove_by_class=no_embeds)

ninjs = {
'guid': item.get('_id'),
'version': str(item.get('version', 1)),
Expand Down
12 changes: 12 additions & 0 deletions newsroom/wire/formatters/ninjs2.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .ninjs import NINJSFormatter
from newsroom.news_api.utils import check_featuremedia_association_permission
from newsroom.wire.formatters.utils import remove_internal_renditions
from newsroom.utils import remove_all_embeds


class NINJSFormatter2(NINJSFormatter):
Expand All @@ -18,3 +19,14 @@ def _transform_to_ninjs(self, item):
if not item.get('associations'):
item.pop('associations', None)
return remove_internal_renditions(super()._transform_to_ninjs(item), remove_media=True)


class NINJSFormatter3(NINJSFormatter2):
    """
    NINJSFormatter2 variant that always removes the embeds from the item before formatting it
    """

    def _transform_to_ninjs(self, item):
        # remove_all_embeds is called with its default arguments and modifies the item in place
        remove_all_embeds(item)
        return super()._transform_to_ninjs(item)
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/nitf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

from lxml import etree
from superdesk.publish.formatters.nitf_formatter import NITFFormatter as SuperdeskNITFFormatter
from newsroom.utils import remove_all_embeds

from .base import BaseFormatter

Expand All @@ -14,6 +15,7 @@ class NITFFormatter(BaseFormatter):
formatter = SuperdeskNITFFormatter()

def format_item(self, item, item_type='items'):
    """
    Format the item as NITF after removing the embeds from it.

    :param item: item to format, the embeds are removed from it in place
    :param item_type: not used by this method
    :return: the serialised NITF with an XML declaration, pretty printed, in ``self.encoding``
    """
    remove_all_embeds(item)
    nitf_tree = self.formatter.get_nitf(item, {}, '')
    return etree.tostring(nitf_tree, xml_declaration=True, pretty_print=True, encoding=self.encoding)
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

import flask
from .base import BaseFormatter
from newsroom.utils import remove_all_embeds


class TextFormatter(BaseFormatter):
Expand All @@ -9,6 +10,7 @@ class TextFormatter(BaseFormatter):
MIMETYPE = 'text/plain'

def format_item(self, item, item_type='items'):
remove_all_embeds(item)
if item_type == 'items':
return str.encode(flask.render_template('download_item.txt', item=item), 'utf-8')
else:
Expand Down
3 changes: 2 additions & 1 deletion newsroom/wire/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,8 @@ def download(_ids):

update_action_list(_ids.split(','), 'downloads', force_insert=True)
get_resource_service('history').create_history_record(items, 'download', user, request.args.get('type', 'wire'))
return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True)
return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True,
cache_timeout=0)


@blueprint.route('/wire_share', methods=['POST'])
Expand Down
Loading