Skip to content

Commit

Permalink
Feature gate the “ignore redundant nesting for likely siblings” behaviour
Browse files Browse the repository at this point in the history
  • Loading branch information
tuzz committed Aug 27, 2024
1 parent 1e82cd3 commit 71e1a0d
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 3 deletions.
14 changes: 12 additions & 2 deletions lib/readability.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ class Document
:blacklist => nil,
:whitelist => nil,
:elements_to_score => ["p", "td", "pre"],
:likely_siblings => ["p"]
:likely_siblings => ["p"],
:ignore_redundant_nesting => false
}.freeze

REGEXES = {
Expand Down Expand Up @@ -264,7 +265,16 @@ def get_article(candidates, best_candidate)
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
output = Nokogiri::XML::Node.new('div', @html)
node = closest_node_with_siblings(best_candidate[:elem])

# If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
# find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
# related content detection, but could lead to false positives. Not supported in arc90's readability.
node =
if options[:ignore_redundant_nesting]
closest_node_with_siblings(best_candidate[:elem])
else
best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
end

node.parent.children.each do |sibling|
append = false
Expand Down
2 changes: 1 addition & 1 deletion spec/readability_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@
end

it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"])
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
<html>
<head>
<title>title!</title>
Expand Down

0 comments on commit 71e1a0d

Please sign in to comment.