forked from abak-press/string_tools
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6aebc67
commit e48da9f
Showing
6 changed files
with
170 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,3 +7,4 @@ | |
/pkg/ | ||
/spec/reports/ | ||
/tmp/ | ||
/gemfiles |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,4 @@ source 'https://rubygems.org' | |
|
||
# Specify your gem's dependencies in string_tools.gemspec | ||
gemspec | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# coding: utf-8 | ||
require 'loofah' | ||
require 'uri' | ||
|
||
module StringTools
  module HTML
    # Public: Removes links that point to domains outside the whitelist.
    # A removed link is replaced by its own children, so the link text and
    # any nested markup stay in the document.
    #
    # html    - String with HTML that may contain unwanted links.
    # options - Hash
    #           :whitelist - Array of String allowed domains (required).
    #                        Subdomains of an allowed domain are allowed too.
    #
    # Examples
    #   html = '<a href="https://www.yandex.ru">yandex</a>'
    #
    #   StringTools::HTML.remove_links(html, whitelist: ['google.com'])
    #   # => 'yandex'
    #
    #   StringTools::HTML.remove_links(html, whitelist: ['yandex.ru'])
    #   # => '<a href="https://www.yandex.ru">yandex</a>'
    #
    #   StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru'])
    #   # => '<a href="https://www.yandex.ru">yandex</a>'
    #
    #   html = '<a href="https://yandex.ru">yandex</a>'
    #
    #   StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru'])
    #   # => 'yandex'
    #
    # Returns String without links to external resources.
    # Raises KeyError if options has no :whitelist key.
    def self.remove_links(html, options = {})
      Loofah.fragment(html).scrub!(LinksRemoveScrubber.new(options)).to_s
    end

    # Internal: Loofah scrubber that unwraps every <a> element whose href
    # host is not covered by the whitelist.
    class LinksRemoveScrubber < Loofah::Scrubber
      LINK_TAG = 'a'.freeze
      HREF_ATTRIBUTE = 'href'.freeze
      LABEL_SEPARATOR = '.'.freeze

      # options - Hash
      #           :whitelist - Array of String allowed domains (required).
      #
      # Raises KeyError if :whitelist is missing.
      def initialize(options)
        # Array() keeps a lone String from being matched by substring via
        # String#include?; entries are downcased because host names are
        # compared case-insensitively.
        @whitelist = Array(options.fetch(:whitelist)).map { |entry| entry.to_s.downcase }
      end

      # Unwraps the node (replaces it with its children) when it is a link
      # to a non-whitelisted host or its href cannot be parsed as a URI.
      # Nodes other than <a> are left untouched.
      #
      # node - the document node handed over by Loofah.
      def scrub(node)
        return unless node.name == LINK_TAG
        uri = URI.parse(node[HREF_ATTRIBUTE])
        node.swap(node.children) unless whitelisted?(uri.host)
      rescue URI::InvalidURIError
        node.swap(node.children)
      end

      # Checks whether the domain or one of its parent domains is whitelisted.
      #
      # The host is rebuilt label by label starting from the right
      # ("ru" -> "yandex.ru" -> "www.yandex.ru") and every candidate with at
      # least two labels is looked up; a bare top-level label is never
      # looked up on its own.
      #
      # domain - String host name, or nil when the URI has no host
      #          (relative links, "mailto:" and similar).
      #
      # Returns true or false. A nil or empty domain is never whitelisted.
      def whitelisted?(domain)
        # URI#host is nil for host-less hrefs; such a link cannot match any
        # whitelisted domain, so report false instead of failing on nil.
        return false if domain.nil?

        labels = domain.downcase.split(LABEL_SEPARATOR).reverse!
        candidate = labels.first # com, ru ...
        labels.drop(1).any? do |label|
          candidate = "#{label}.#{candidate}"
          @whitelist.include?(candidate)
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# coding: utf-8 | ||
require 'spec_helper' | ||
|
||
# Specs for StringTools::HTML.remove_links: links to hosts outside the
# whitelist are replaced by their inner markup, whitelisted links are kept.
describe StringTools::HTML do
  describe '.remove_links' do
    context 'whitelist option empty' do
      subject { StringTools::HTML.remove_links(html, whitelist: []) }

      context 'content without links' do
        let(:html) { ' <b>hello</b> <script>alert("world")</script> ' }

        it 'should return html as is' do
          is_expected.to eq html
        end
      end

      context 'content with links' do
        let(:html) do
          <<-MARKUP
            <a href="https://google.com"><span>goo</span><span>gle</span></a>
            <a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should return markup without links' do
          is_expected.to eq(<<-MARKUP)
            <span>goo</span><span>gle</span>
            <span>yan</span><span>dex</span>
          MARKUP
        end
      end

      # A link placed inside another link.
      context 'content with recursive markup' do
        let(:html) do
          <<-MARKUP
            <a href="https://google.com"><a href="https://google.com">goo</a><span>gle</span></a>
            <a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should return content without links' do
          is_expected.to eq(<<-MARKUP)
            goo<span>gle</span>
            <span>yan</span><span>dex</span>
          MARKUP
        end
      end
    end

    context 'when whitelist passed' do
      subject { StringTools::HTML.remove_links(html, whitelist: ['yandex.ru']) }

      context 'domain link match to whitelisted' do
        let(:html) do
          <<-MARKUP
            <a href="https://google.com"><span>goo</span><span>gle</span></a>
            <a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should keep only whitelisted links' do
          is_expected.to eq(<<-MARKUP)
            <span>goo</span><span>gle</span>
            <a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end
      end

      # www.yandex.ru is covered by the whitelisted yandex.ru.
      context 'link domain is subdomain of whitelisted' do
        let(:html) do
          <<-MARKUP
            <a href="https://google.com"><span>goo</span><span>gle</span></a>
            <a href="https://www.yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should keep only whitelisted links' do
          is_expected.to eq(<<-MARKUP)
            <span>goo</span><span>gle</span>
            <a href="https://www.yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end
      end

      # Whitelisting a subdomain must not whitelist its parent domain.
      context 'link domain is parent domain of whitelisted' do
        subject { StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru']) }

        let(:html) do
          <<-MARKUP
            <a href="https://google.com"><span>goo</span><span>gle</span></a>
            <a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should remove link' do
          is_expected.to eq(<<-MARKUP)
            <span>goo</span><span>gle</span>
            <span>yan</span><span>dex</span>
          MARKUP
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters