diff --git a/README.md b/README.md index 6813625..a11c46a 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including: * `:remove_empty_nodes`: remove `

` tags that have no text content; also removes `

` tags that contain only images; * `:attributes`: whitelist of allowed attributes; -* `:debug`: provide debugging output, defaults false; +* `:debug`: provide debugging output, defaults false; supports setting a Proc; * `:encoding`: if the page is of a known encoding, you can specify it; if left unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish to disable guessing, supply `:do_not_guess_encoding => true`; diff --git a/lib/readability.rb b/lib/readability.rb index 55b875b..a94f310 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -21,7 +21,7 @@ class Document :elements_to_score => ["p", "td", "pre"], :likely_siblings => ["p"] }.freeze - + REGEXES = { :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, @@ -35,7 +35,7 @@ class Document :killBreaksRe => /((\s| ?)*){1,}/, :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i } - + attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image def initialize(input, options = {}) @@ -50,7 +50,7 @@ def initialize(input, options = {}) @input = @input.gsub(REGEXES[:replaceBrsRe], '

').gsub(REGEXES[:replaceFontsRe], '<\1span>') @remove_unlikely_candidates = @options[:remove_unlikely_candidates] @weight_classes = @options[:weight_classes] - @clean_conditionally = @options[:clean_conditionally] + @clean_conditionally = !!@options[:clean_conditionally] @best_candidate_has_image = true make_html handle_exclusions!(@options[:whitelist], @options[:blacklist]) @@ -145,11 +145,11 @@ def images(content=nil, reload=false) (list_images.empty? and content != @html) ? images(@html, true) : list_images end - + def images_with_fqdn_uris!(source_uri) images_with_fqdn_uris(@html, source_uri) end - + def images_with_fqdn_uris(document = @html.dup, source_uri) uri = URI.parse(source_uri) host = uri.host @@ -161,7 +161,7 @@ def images_with_fqdn_uris(document = @html.dup, source_uri) images = [] document.css("img").each do |elem| begin - elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil + elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil images << elem['src'].to_s rescue URI::InvalidURIError => exc elem.remove @@ -271,7 +271,7 @@ def get_article(candidates, best_candidate) if downcased_likely_siblings.include?(sibling.name.downcase) link_density = get_link_density(sibling) - node_content = sibling.text + node_content = sibling.text.strip node_length = node_content.length append = if node_length > 80 && link_density < 0.25 @@ -372,7 +372,11 @@ def score_node(elem) end def debug(str) - puts str if options[:debug] + if options[:debug].respond_to?(:call) + options[:debug].call(str) + elsif options[:debug] + puts str + end end def remove_unlikely_candidates! @@ -426,7 +430,8 @@ def sanitize(node, candidates, options = {}) # We'll sanitize all elements using a whitelist base_whitelist = @options[:tags] || %w[div p] - all_whitelisted = base_whitelist.include?("*") + all_tags_whitelisted = base_whitelist.include?("*") + all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*") # We'll add whitespace instead of block elements, # so a
b will have a nice space between them @@ -440,8 +445,8 @@ def sanitize(node, candidates, options = {}) ([node] + node.css("*")).each do |el| # If element is in whitelist, delete all its attributes - if all_whitelisted || whitelist[el.node_name] - el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } + if all_tags_whitelisted || whitelist[el.node_name] + el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted # Otherwise, replace the element with its contents else @@ -470,30 +475,43 @@ def sanitize(node, candidates, options = {}) def clean_conditionally(node, candidates, selector) return unless @clean_conditionally + node.css(selector).each do |el| weight = class_weight(el) content_score = candidates[el] ? candidates[el][:content_score] : 0 name = el.name.downcase - + remove = false + message = nil + if weight + content_score < 0 - el.remove - debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") + remove = true + message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero." elsif el.text.count(",") < 10 counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } counts["li"] -= 100 # For every img under a noscript tag discount one from the count to avoid double counting counts["img"] -= el.css("noscript").css("img").length - + content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace link_density = get_link_density(el) reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) if reason - debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") - el.remove + message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}." + remove = true end end + + if options[:clean_conditionally].respond_to?(:call) + context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el } + remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element. + end + + if remove + debug(message || "Conditionally cleaned by user-specified function.") + el.remove + end end end diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb index 0bcfb8a..7e86354 100644 --- a/spec/readability_spec.rb +++ b/spec/readability_spec.rb @@ -115,6 +115,11 @@ expect(@doc.content).to include('') end + it "should be able to whitelist all attributes" do + @doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"]) + expect(@doc.content).to include('') + end + it "should not try to download local images" do @doc = Readability::Document.new(<<-HTML) @@ -498,6 +503,9 @@

This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.

The likely_siblings now include the section tag so it should be included in the output.

+
+

too short when stripped

+
#{'This link lowers the body score.' * 5} @@ -506,6 +514,7 @@ expect(@doc.content).to include("Paragraph 1") expect(@doc.content).to include("Paragraph 2") expect(@doc.content).to include("should be included") + expect(@doc.content).not_to include("too short when stripped") end end @@ -739,11 +748,33 @@ end describe "clean_conditionally_reason?" do - let (:list_fixture) { "

test

#{'
  • ' * 102}" } + let(:list_fixture) { "

    test

    #{'
  • ' * 102}" } it "does not raise error" do @doc = Readability::Document.new(list_fixture) expect { @doc.content }.to_not raise_error end end + + describe "clean_conditionally" do + let(:fixture) { "title!

    Some content

    " } + + it "can set a clean_conditionally function to allow overriding the default decision" do + clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision. + content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content + + expect(content).to include("sidebar") + expect(content).not_to include('Some content') + end + end + + describe "debug" do + it "can set a debug function, e.g. to send output to Rails logger" do + output = [] + debug_fn = lambda { |str| output << str } + + Readability::Document.new(@simple_html_fixture, debug: debug_fn).content + expect(output).not_to be_empty + end + end end