diff --git a/lib/readability.rb b/lib/readability.rb index a94f310..4e4309f 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -19,7 +19,8 @@ class Document :blacklist => nil, :whitelist => nil, :elements_to_score => ["p", "td", "pre"], - :likely_siblings => ["p"] + :likely_siblings => ["p"], + :ignore_redundant_nesting => false }.freeze REGEXES = { @@ -264,9 +265,20 @@ def get_article(candidates, best_candidate) sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max downcased_likely_siblings = options[:likely_siblings].map(&:downcase) output = Nokogiri::XML::Node.new('div', @html) - best_candidate[:elem].parent.children.each do |sibling| + + # If the best candidate is the only element in its parent then we will never find any siblings. Therefore, + # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the + # related content detection, but could lead to false positives. Not supported in arc90's readability. + node = + if options[:ignore_redundant_nesting] + closest_node_with_siblings(best_candidate[:elem]) + else + best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability. + end + + node.parent.children.each do |sibling| append = false - append = true if sibling == best_candidate[:elem] + append = true if sibling == node append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold if downcased_likely_siblings.include?(sibling.name.downcase) @@ -291,6 +303,23 @@ def get_article(candidates, best_candidate) output end + def closest_node_with_siblings(element) + node = element + + until node.node_name == 'body' + siblings = node.parent.children + non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? } + + if non_empty.size > 1 + return node + else + node = node.parent + end + end + + node + end + def select_best_candidate(candidates) sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] } diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb index 7e86354..5049d40 100644 --- a/spec/readability_spec.rb +++ b/spec/readability_spec.rb @@ -516,6 +516,35 @@ expect(@doc.content).to include("should be included") expect(@doc.content).not_to include("too short when stripped") end + + it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do + @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true) + + + title! + + +
+
+
+

Paragraph 1

+ #{'

Paragraph 2

' * 10} +
+
+
+
+

This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.

+

The likely_siblings now include the section tag so it should be included in the output.

+
+ #{'This link lowers the body score.' * 5} + + + HTML + + expect(@doc.content).to include("Paragraph 1") + expect(@doc.content).to include("Paragraph 2") + expect(@doc.content).to include("should be included") + end end describe "the cant_read.html fixture" do