diff --git a/README.md b/README.md
index 6813625..a11c46a 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
* `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
removes `<p>` tags that contain only images;
* `:attributes`: whitelist of allowed attributes;
-* `:debug`: provide debugging output, defaults false;
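+* `:debug`: provide debugging output, defaults to false; may also be set to a Proc (or any object that responds to `call`), which is then called with each debug message instead of printing it;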
* `:encoding`: if the page is of a known encoding, you can specify it; if left
unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
to disable guessing, supply `:do_not_guess_encoding => true`;
diff --git a/lib/readability.rb b/lib/readability.rb
index 55b875b..a94f310 100644
--- a/lib/readability.rb
+++ b/lib/readability.rb
@@ -21,7 +21,7 @@ class Document
:elements_to_score => ["p", "td", "pre"],
:likely_siblings => ["p"]
}.freeze
-
+
REGEXES = {
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
@@ -35,7 +35,7 @@ class Document
:killBreaksRe => /(
(\s| ?)*){1,}/,
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}
-
+
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
def initialize(input, options = {})
@@ -50,7 +50,7 @@ def initialize(input, options = {})
@input = @input.gsub(REGEXES[:replaceBrsRe], '
').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
- @clean_conditionally = @options[:clean_conditionally]
+ @clean_conditionally = !!@options[:clean_conditionally]
@best_candidate_has_image = true
make_html
handle_exclusions!(@options[:whitelist], @options[:blacklist])
@@ -145,11 +145,11 @@ def images(content=nil, reload=false)
(list_images.empty? and content != @html) ? images(@html, true) : list_images
end
-
+
def images_with_fqdn_uris!(source_uri)
images_with_fqdn_uris(@html, source_uri)
end
-
+
def images_with_fqdn_uris(document = @html.dup, source_uri)
uri = URI.parse(source_uri)
host = uri.host
@@ -161,7 +161,7 @@ def images_with_fqdn_uris(document = @html.dup, source_uri)
images = []
document.css("img").each do |elem|
begin
- elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
images << elem['src'].to_s
rescue URI::InvalidURIError => exc
elem.remove
@@ -271,7 +271,7 @@ def get_article(candidates, best_candidate)
if downcased_likely_siblings.include?(sibling.name.downcase)
link_density = get_link_density(sibling)
- node_content = sibling.text
+ node_content = sibling.text.strip
node_length = node_content.length
append = if node_length > 80 && link_density < 0.25
@@ -372,7 +372,11 @@ def score_node(elem)
end
def debug(str)
- puts str if options[:debug]
+ if options[:debug].respond_to?(:call)
+ options[:debug].call(str)
+ elsif options[:debug]
+ puts str
+ end
end
def remove_unlikely_candidates!
@@ -426,7 +430,8 @@ def sanitize(node, candidates, options = {})
# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
- all_whitelisted = base_whitelist.include?("*")
+ all_tags_whitelisted = base_whitelist.include?("*")
+ all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
# We'll add whitespace instead of block elements,
# so a<br>b will have a nice space between them
@@ -440,8 +445,8 @@ def sanitize(node, candidates, options = {})
([node] + node.css("*")).each do |el|
# If element is in whitelist, delete all its attributes
- if all_whitelisted || whitelist[el.node_name]
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
+ if all_tags_whitelisted || whitelist[el.node_name]
+ el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
# Otherwise, replace the element with its contents
else
@@ -470,30 +475,43 @@ def sanitize(node, candidates, options = {})
def clean_conditionally(node, candidates, selector)
return unless @clean_conditionally
+
node.css(selector).each do |el|
weight = class_weight(el)
content_score = candidates[el] ? candidates[el][:content_score] : 0
name = el.name.downcase
-
+ remove = false
+ message = nil
+
if weight + content_score < 0
- el.remove
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
+ remove = true
+ message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
elsif el.text.count(",") < 10
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
counts["li"] -= 100
# For every img under a noscript tag discount one from the count to avoid double counting
counts["img"] -= el.css("noscript").css("img").length
-
+
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
link_density = get_link_density(el)
reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
if reason
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
- el.remove
+ message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
+ remove = true
end
end
+
+ if options[:clean_conditionally].respond_to?(:call)
+ context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
+ remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
+ end
+
+ if remove
+ debug(message || "Conditionally cleaned by user-specified function.")
+ el.remove
+ end
end
end
diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb
index 0bcfb8a..7e86354 100644
--- a/spec/readability_spec.rb
+++ b/spec/readability_spec.rb
@@ -115,6 +115,11 @@
expect(@doc.content).to include('')
end
+ it "should be able to whitelist all attributes" do
+ @doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
+ expect(@doc.content).to include('')
+ end
+
it "should not try to download local images" do
@doc = Readability::Document.new(<<-HTML)
@@ -498,6 +503,9 @@
This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.
The likely_siblings now include the section tag so it should be included in the output.
+too short when stripped
+