From 71e1a0d750d8de317606b53662f11aa793839d41 Mon Sep 17 00:00:00 2001 From: Chris Patuzzo Date: Tue, 27 Aug 2024 12:08:03 +0100 Subject: [PATCH] =?UTF-8?q?Feature=20gate=20the=20=E2=80=9Cignore=20redund?= =?UTF-8?q?ant=20nesting=20for=20likely=20siblings=E2=80=9D=20behaviour?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/readability.rb | 14 ++++++++++++-- spec/readability_spec.rb | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/readability.rb b/lib/readability.rb index e289699..4e4309f 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -19,7 +19,8 @@ class Document :blacklist => nil, :whitelist => nil, :elements_to_score => ["p", "td", "pre"], - :likely_siblings => ["p"] + :likely_siblings => ["p"], + :ignore_redundant_nesting => false }.freeze REGEXES = { @@ -264,7 +265,16 @@ def get_article(candidates, best_candidate) sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max downcased_likely_siblings = options[:likely_siblings].map(&:downcase) output = Nokogiri::XML::Node.new('div', @html) - node = closest_node_with_siblings(best_candidate[:elem]) + + # If the best candidate is the only element in its parent then we will never find any siblings. Therefore, + # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the + # related content detection, but could lead to false positives. Not supported in arc90's readability. + node = + if options[:ignore_redundant_nesting] + closest_node_with_siblings(best_candidate[:elem]) + else + best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability. + end node.parent.children.each do |sibling| append = false diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb index 0b6220a..5049d40 100644 --- a/spec/readability_spec.rb +++ b/spec/readability_spec.rb @@ -518,7 +518,7 @@ end it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do - @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"]) + @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true) title!