From 9258005c8505b981fe82b337bc7de3d76feddddf Mon Sep 17 00:00:00 2001 From: vhanded Date: Fri, 25 Apr 2014 16:42:15 +0800 Subject: [PATCH 1/2] Added feature to get the largest image, and also grab image size from style attribute, if available --- lib/readability.rb | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/lib/readability.rb b/lib/readability.rb index 1de9244..2ef69f2 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -17,7 +17,8 @@ class Document :min_image_height => 80, :ignore_image_format => [], :blacklist => nil, - :whitelist => nil + :whitelist => nil, + :get_largest_image => false }.freeze attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image @@ -90,6 +91,9 @@ def images(content=nil, reload=false) @best_candidate_has_image = false if reload + largest_image_url = nil + largest_image_area = 0 + prepare_candidates list_images = [] tested_images = [] @@ -105,6 +109,22 @@ def images(content=nil, reload=false) height = element["height"].nil? ? 0 : element["height"].value.to_i width = element["width"].nil? ? 0 : element["width"].value.to_i + if element["style"] + + width_reg = /width:(\d+)/.match(element["style"]) + height_reg = /height:(\d+)/.match(element["style"]) + + if width_reg + width = width_reg[1].to_i + end + + if height_reg + height = height_reg[1].to_i + end + + end + + if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?) image = get_image_size(url) next unless image @@ -121,13 +141,27 @@ def images(content=nil, reload=false) tested_images.push(url) if image_meets_criteria?(image) - list_images << url + if options[:get_largest_image] + area = image[:height] * image[:width] + if area > largest_image_area + largest_image_area = area + largest_image_url = url + end + else + list_images << url + end + else debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}") end end - (list_images.empty? and content != @html) ? images(@html, true) : list_images + if options[:get_largest_image] and largest_image_url + list_images << largest_image_url + end + + (list_images.empty? and content != @html) ? images(@html, true) : list_images + end def get_image_size(url) From 52eb8c5040cf1d3137f4cc6078eb0ef544295e31 Mon Sep 17 00:00:00 2001 From: vhanded Date: Mon, 28 Apr 2014 21:34:20 +0800 Subject: [PATCH 2/2] Ability to support black list image url --- lib/readability.rb | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/lib/readability.rb b/lib/readability.rb index 2ef69f2..d8b2ef1 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -18,7 +18,8 @@ class Document :ignore_image_format => [], :blacklist => nil, :whitelist => nil, - :get_largest_image => false + :get_largest_image => false, + :url_blacklist => [] }.freeze attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image @@ -82,6 +83,18 @@ def make_html(whitelist=nil, blacklist=nil) @html.xpath('//comment()').each { |i| i.remove } end + + def is_blacklist(url) + options[:url_blacklist].each do |blacklist| + if url.include? blacklist + return true + end + end + + return false + end + + def images(content=nil, reload=false) begin require 'fastimage' @@ -109,6 +122,11 @@ def images(content=nil, reload=false) height = element["height"].nil? ? 0 : element["height"].value.to_i width = element["width"].nil? ? 0 : element["width"].value.to_i + if is_blacklist(url) + debug("image discarded (blacklist): #{url}") + next + end + if element["style"] width_reg = /width:(\d+)/.match(element["style"]) @@ -144,6 +162,11 @@ def images(content=nil, reload=false) if options[:get_largest_image] area = image[:height] * image[:width] if area > largest_image_area + + if largest_image_url + debug("Image discarded by larger image: #{largest_image_url}") + end + largest_image_area = area largest_image_url = url end @@ -169,7 +192,7 @@ def get_image_size(url) raise "Couldn't get size." if w.nil? || h.nil? {:width => w, :height => h} rescue => e - debug("Image error: #{e}") + debug("Image error: #{e} url: #{url}") nil end