diff --git a/lib/readability.rb b/lib/readability.rb index e289699..4e4309f 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -19,7 +19,8 @@ class Document :blacklist => nil, :whitelist => nil, :elements_to_score => ["p", "td", "pre"], - :likely_siblings => ["p"] + :likely_siblings => ["p"], + :ignore_redundant_nesting => false }.freeze REGEXES = { @@ -264,7 +265,16 @@ def get_article(candidates, best_candidate) sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max downcased_likely_siblings = options[:likely_siblings].map(&:downcase) output = Nokogiri::XML::Node.new('div', @html) - node = closest_node_with_siblings(best_candidate[:elem]) + + # If the best candidate is the only element in its parent then we will never find any siblings. Therefore, + # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the + # related content detection, but could lead to false positives. Not supported in arc90's readability. + node = + if options[:ignore_redundant_nesting] + closest_node_with_siblings(best_candidate[:elem]) + else + best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability. + end node.parent.children.each do |sibling| append = false diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb index 0b6220a..5049d40 100644 --- a/spec/readability_spec.rb +++ b/spec/readability_spec.rb @@ -518,7 +518,7 @@ end it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do - @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"]) + @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true) title!