Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore redundant nesting when checking for related siblings #98

Merged
merged 2 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions lib/readability.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ class Document
:blacklist => nil,
:whitelist => nil,
:elements_to_score => ["p", "td", "pre"],
:likely_siblings => ["p"]
:likely_siblings => ["p"],
:ignore_redundant_nesting => false
}.freeze

REGEXES = {
Expand Down Expand Up @@ -264,9 +265,20 @@ def get_article(candidates, best_candidate)
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
output = Nokogiri::XML::Node.new('div', @html)
best_candidate[:elem].parent.children.each do |sibling|

# If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
# find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
# related content detection, but could lead to false positives. Not supported in arc90's readability.
node =
if options[:ignore_redundant_nesting]
closest_node_with_siblings(best_candidate[:elem])
else
best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
end

node.parent.children.each do |sibling|
append = false
append = true if sibling == best_candidate[:elem]
append = true if sibling == node
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

if downcased_likely_siblings.include?(sibling.name.downcase)
Expand All @@ -291,6 +303,23 @@ def get_article(candidates, best_candidate)
output
end

def closest_node_with_siblings(element)
node = element

until node.node_name == 'body'
siblings = node.parent.children
non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }

if non_empty.size > 1
return node
else
node = node.parent
end
end

node
end

def select_best_candidate(candidates)
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

Expand Down
29 changes: 29 additions & 0 deletions spec/readability_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,35 @@
expect(@doc.content).to include("should be included")
expect(@doc.content).not_to include("too short when stripped")
end

it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
<html>
<head>
<title>title!</title>
</head>
<body>
<div> <!-- This is the closest node of the best candidate that has siblings. -->
<div>
<section>
<p>Paragraph 1</p>
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
</section>
</div>
</div>
<section>
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
</section>
#{'<a href="/">This link lowers the body score.</a>' * 5}
</body>
</html>
HTML

expect(@doc.content).to include("Paragraph 1")
expect(@doc.content).to include("Paragraph 2")
expect(@doc.content).to include("should be included")
end
end

describe "the cant_read.html fixture" do
Expand Down
Loading