Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance detection of lazy image #910

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 97 additions & 70 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ function Readability(doc, options) {
this._disableJSONLD = !!options.disableJSONLD;
this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
this._linkDensityModifier = options.linkDensityModifier || 0;
// If true, will always overwrite img src with found data-src attribute.
this._overwriteImgSrc = !!options.overwriteImgSrc;

// Start with all flags set
this._flags =
Expand Down Expand Up @@ -108,6 +110,10 @@ function Readability(doc, options) {
}
}

/**
 * Helper: OR multiple regexps into one.
 *
 * The sources of the given regexps are joined with "|" into a single pattern.
 * Flags on the inputs are not carried over; the combined regexp has no flags.
 *
 * @param {...RegExp} regexps The regular expressions to combine.
 * @return {RegExp} A flagless regexp whose pattern is the alternation of all
 *   input patterns.
 */
function _combineRegExps(...regexps) {
  // Declared as a function instead of being assigned to an undeclared name:
  // such an assignment creates an implicit global and throws a ReferenceError
  // in strict mode or in an ES module.
  return new RegExp(regexps.map(regexp => regexp.source).join("|"));
}

Readability.prototype = {
FLAG_STRIP_UNLIKELYS: 0x1,
FLAG_WEIGHT_CLASSES: 0x2,
Expand Down Expand Up @@ -172,6 +178,15 @@ Readability.prototype = {
/^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu,
loadingWords:
/^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu,
// used to identify img data-src attribute:
imgSrcset:
/\.(jpg|jpeg|png|webp)\s+\d/,
imgSrc: _combineRegExps(
/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/,
/^\s*https?:\/\/\S+=(jpg|jpeg|png|webp)\S*\s*$/),
// used to identify lazy img src (aka placeholder)
lazyImgSrc:
/svg\s+(width|height)=['"]?1(px)?['"]?\s+/
},

UNLIKELY_ROLES: [
Expand Down Expand Up @@ -2296,85 +2311,97 @@ Readability.prototype = {
}
},

/* convert images and figures that have properties like data-src into images that can be loaded without JS */
_fixLazyImages(root) {
this._forEachNode(
this._getAllNodesWithTag(root, ["img", "picture", "figure"]),
function (elem) {
// In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
// So, here we check if the data uri is too short, just might as well remove it.
if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
// Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
if (parts[1] === "image/svg+xml") {
return;
/**
* Look for the first data-src like property. If found, convert image/figure
* element into image that can be loaded without JS, and return true.
* Otherwise return false.
*/
_fixLazyImage(elem, dry_run) {
for (var j = 0; j < elem.attributes.length; j++) {
attr = elem.attributes[j];
if (
attr.name === "src" ||
attr.name === "srcset" ||
attr.name === "alt"
) {
continue;
}
var copyTo = null;
if (this.REGEXPS.imgSrcset.test(attr.value)) {
copyTo = "srcset";
} else if (this.REGEXPS.imgSrc.test(attr.value)) {
copyTo = "src";
}
if (copyTo) {
if (!dry_run) {
//if this is an img or picture, set the attribute directly
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
elem.setAttribute(copyTo, attr.value);
} else if (
elem.tagName === "FIGURE" &&
!this._getAllNodesWithTag(elem, ["img", "picture"]).length
) {
//if the item is a <figure> that does not contain an image or picture, create
//one and place it inside the figure see the nytimes-3 testcase for an example
var img = this._doc.createElement("img");
img.setAttribute(copyTo, attr.value);
elem.appendChild(img);
}
}
return true;
}
}
return false;
},

// Make sure this element has other attributes which contains image.
// If it doesn't, then this src is important and shouldn't be removed.
var srcCouldBeRemoved = false;
for (var i = 0; i < elem.attributes.length; i++) {
var attr = elem.attributes[i];
if (attr.name === "src") {
continue;
}

if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
srcCouldBeRemoved = true;
break;
}
}
/**
* On some sites (e.g. Kotaku, Wechat), a 1px square placeholder image is put in the src
* attribute as a data URI (base64 or not). So here we check whether the data URI is too
* short, or its width or height is 1, and if so (and another attribute holds an image) remove it.
*/
_maybeRemoveImgSrc(elem) {
if (!elem.src) {
return;
}

// Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
// it will be too small, therefore it might be placeholder image.
if (srcCouldBeRemoved) {
var b64starts = elem.src.search(/base64\s*/i) + 7;
var b64length = elem.src.length - b64starts;
if (b64length < 133) {
elem.removeAttribute("src");
}
}
}
var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
if (parts != null) { // base64 encoded
// Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
if (parts[1] === "image/svg+xml") {
return;
}
// Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
// it will be too small, therefore it might be placeholder image.
var b64starts = elem.src.search(/base64\s*/i) + 7;
var b64length = elem.src.length - b64starts;
if (b64length >= 133) {
return;
}
} else if (!this.REGEXPS.lazyImgSrc.test(elem.src)) {
return;
}

// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
if (
(elem.src || (elem.srcset && elem.srcset != "null")) &&
!elem.className.toLowerCase().includes("lazy")
) {
return;
}
if (this._fixLazyImage(elem, true)) { // src could be removed
elem.removeAttribute("src");
}
},

for (var j = 0; j < elem.attributes.length; j++) {
attr = elem.attributes[j];
/* convert images and figures that have properties like data-src into images that can be loaded without JS */
_fixLazyImages(root) {
this._forEachNode(
this._getAllNodesWithTag(root, ["img", "picture", "figure"]),
function (elem) {
if (!this._overwriteImgSrc) { // overwrite is conditional, not forced
this._maybeRemoveImgSrc(elem);
// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
if (
attr.name === "src" ||
attr.name === "srcset" ||
attr.name === "alt"
(elem.src || (elem.srcset && elem.srcset != "null")) &&
!elem.className.toLowerCase().includes("lazy")
) {
continue;
}
var copyTo = null;
if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
copyTo = "srcset";
} else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
copyTo = "src";
}
if (copyTo) {
//if this is an img or picture, set the attribute directly
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
elem.setAttribute(copyTo, attr.value);
} else if (
elem.tagName === "FIGURE" &&
!this._getAllNodesWithTag(elem, ["img", "picture"]).length
) {
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
//see the nytimes-3 testcase for an example
var img = this._doc.createElement("img");
img.setAttribute(copyTo, attr.value);
elem.appendChild(img);
}
return;
}
}
this._fixLazyImage(elem, false);
}
);
},
Expand Down
4 changes: 2 additions & 2 deletions test/test-pages/herald-sun-1/expected.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<div id="readability-page-1" class="page">
<div>
<div>
<p><img data-src="http://api.news.com.au/content/1.0/heraldsun/images/1227261885862?format=jpg&amp;group=iphone&amp;size=medium" alt="A new Bill would require telecommunications service providers to store so-called ‘metadat" />
<p><img src="http://api.news.com.au/content/1.0/heraldsun/images/1227261885862?format=jpg&amp;group=iphone&amp;size=medium" data-src="http://api.news.com.au/content/1.0/heraldsun/images/1227261885862?format=jpg&amp;group=iphone&amp;size=medium" alt="A new Bill would require telecommunications service providers to store so-called ‘metadat" />
</p>
<p class="caption">
<span id="imgCaption">A new Bill would require telecommunications service providers to store so-called ‘metadata’ for two years.</span>
Expand Down Expand Up @@ -30,4 +30,4 @@
<p><b>LAURIE OAKES IS THE NINE NETWORK POLITICAL EDITOR </b></p>
</div>
</div>
</div>
</div>
9 changes: 9 additions & 0 deletions test/test-pages/wechat-image/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"title": "",
"byline": null,
"dir": null,
"excerpt": "This is a simplified Wechat page. The original page is dynamic and complex that JSDOMParser cannot parse.",
"siteName": null,
"publishedTime": null,
"readerable": false
}
4 changes: 4 additions & 0 deletions test/test-pages/wechat-image/expected.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<div id="readability-page-1" class="page">
<p>This is a simplified Wechat page. The original page is dynamic and complex that JSDOMParser cannot parse.</p>
<p><img src="https://mmbiz.qpic.cn/sz_mmbiz_png/WLexANxQwzON3iaEurTUFzEVpGT9vP0Hp1oUJRh2ftBJVbpFqdVZaxFNesgTyBGiaCh6DEmjicNZiaDFvN5Vdt5JuQ/640?wx_fmt=png&amp;from=appmsg" data-src="https://mmbiz.qpic.cn/sz_mmbiz_png/WLexANxQwzON3iaEurTUFzEVpGT9vP0Hp1oUJRh2ftBJVbpFqdVZaxFNesgTyBGiaCh6DEmjicNZiaDFvN5Vdt5JuQ/640?wx_fmt=png&amp;from=appmsg" /></p>
</div>
10 changes: 10 additions & 0 deletions test/test-pages/wechat-image/source.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<html>
<body>
<p>This is a simplified Wechat page. The original page is dynamic and complex that JSDOMParser cannot parse.</p>
<p><img
src="data:image/svg+xml,%3C%3Fxml version='1.0' encoding='UTF-8'%3F%3E%3Csvg width='1px' height='1px' viewBox='0 0 1 1' version='1.1' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'%3E%3Ctitle%3E%3C/title%3E%3Cg stroke='none' stroke-width='1' fill='none' fill-rule='evenodd' fill-opacity='0'%3E%3Cg transform='translate(-249.000000, -126.000000)' fill='%23FFFFFF'%3E%3Crect x='249' y='126' width='1' height='1'%3E%3C/rect%3E%3C/g%3E%3C/g%3E%3C/svg%3E"
data-src="https://mmbiz.qpic.cn/sz_mmbiz_png/WLexANxQwzON3iaEurTUFzEVpGT9vP0Hp1oUJRh2ftBJVbpFqdVZaxFNesgTyBGiaCh6DEmjicNZiaDFvN5Vdt5JuQ/640?wx_fmt=png&from=appmsg"
/></p>
</body>
</html>