Skip to content

Commit

Permalink
feature: удаление ссылок из текста
Browse files Browse the repository at this point in the history
  • Loading branch information
DmitryBochkarev committed Oct 13, 2015
1 parent 6aebc67 commit e48da9f
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
/pkg/
/spec/reports/
/tmp/
/gemfiles
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ source 'https://rubygems.org'

# Specify your gem's dependencies in string_tools.gemspec
gemspec

2 changes: 2 additions & 0 deletions lib/string_tools.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
require 'string_tools/core_ext/string'

module StringTools
autoload :HTML, 'string_tools/html'

module CharDet
# Возвращает true если строка содержит допустимую
# последовательность байтов для кодировки utf8 и false в обратном случае
Expand Down
60 changes: 60 additions & 0 deletions lib/string_tools/html.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# coding: utf-8
require 'loofah'
require 'uri'

module StringTools
module HTML
# Public: Strips links that point to domains outside of the whitelist.
#
# A stripped link is replaced by its own content, so the link text and any
# nested markup stay in the result.
#
# html    - String with markup that may contain unwanted links
# options - Hash
#           :whitelist - Array of String allowed domains (required;
#                        a missing key raises KeyError)
#
# Examples
#   html = '<a href="https://www.yandex.ru">yandex</a>'
#
#   StringTools::HTML.remove_links(html, whitelist: ['google.com'])
#   # => 'yandex'
#
#   # a whitelisted domain also allows its subdomains
#   StringTools::HTML.remove_links(html, whitelist: ['yandex.ru'])
#   # => '<a href="https://www.yandex.ru">yandex</a>'
#
#   StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru'])
#   # => '<a href="https://www.yandex.ru">yandex</a>'
#
#   html = '<a href="https://yandex.ru">yandex</a>'
#
#   # a whitelisted subdomain does not allow its parent domain
#   StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru'])
#   # => 'yandex'
#
# Returns String without links to external resources
def self.remove_links(html, options = {})
  # Parse first, build the scrubber second: same evaluation order as a
  # single chained expression would have.
  fragment = Loofah.fragment(html)
  scrubber = LinksRemoveScrubber.new(options)
  scrubbed = fragment.scrub!(scrubber)
  scrubbed.to_s
end

# Loofah scrubber that unwraps <a> elements whose target host is not
# whitelisted. "Unwrapping" replaces the element with its own children, so
# the link content stays in the document while the link itself disappears.
class LinksRemoveScrubber < Loofah::Scrubber
  # options - Hash
  #           :whitelist - Array of String allowed domains (required)
  #
  # Raises KeyError when :whitelist is not given.
  def initialize(options)
    # Host names are case-insensitive, so the whitelist is stored lower-cased
    # and hosts are lower-cased before the lookup.
    @whitelist = Array(options.fetch(:whitelist)).map { |domain| domain.to_s.downcase }
  end

  # Called by Loofah for every node of the document.
  #
  # node - the node being visited; only <a> elements are touched.
  #
  # A link is unwrapped when its href is missing, cannot be parsed, has no
  # host (relative, fragment-only, mailto: ...) or has a host that is not
  # whitelisted. Links without a host are unwrapped because they cannot be
  # checked against the whitelist.
  def scrub(node)
    return unless node.name == 'a'.freeze

    href = node['href'.freeze]
    host = href && URI.parse(href).host
    unwrap(node) unless whitelisted?(host)
  rescue URI::InvalidURIError
    unwrap(node)
  end

  # Checks a host name against the whitelist.
  #
  # domain - String host name, or nil when the link has no host.
  #
  # Every suffix of the host made of at least two labels is looked up, from
  # the shortest to the full name: for "a.b.example.com" these are
  # "example.com", "b.example.com" and "a.b.example.com". A bare top-level
  # label ("com", "ru") is never looked up, so a single-label host never
  # matches.
  #
  # Returns true if any of those suffixes is whitelisted, false otherwise.
  def whitelisted?(domain)
    return false if domain.nil? || domain.empty?

    labels = domain.downcase.split('.'.freeze)
    (2..labels.length).any? do |count|
      @whitelist.include?(labels.last(count).join('.'.freeze))
    end
  end

  private

  # Replaces the link element with its children.
  def unwrap(node)
    node.swap(node.children)
  end
end
end
end
104 changes: 104 additions & 0 deletions spec/html_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# coding: utf-8
require 'spec_helper'

# Specs for StringTools::HTML.remove_links.
#
# Heredoc bodies are intentionally flush-left: with `<<-` only the terminator
# may be indented, and any leading whitespace of the body would become part
# of the compared strings.
describe StringTools::HTML do
  describe '.remove_links' do
    # An empty whitelist allows no domain, so every link must be unwrapped.
    context 'whitelist option empty' do
      subject { StringTools::HTML.remove_links(html, whitelist: []) }

      context 'content without links' do
        let(:html) { ' <b>hello</b> <script>alert("world")</script> ' }

        it 'should return html as is' do
          is_expected.to eq html
        end
      end

      context 'content with links' do
        let(:html) do
          <<-MARKUP
<a href="https://google.com"><span>goo</span><span>gle</span></a>
<a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should return markup without links' do
          is_expected.to eq(<<-MARKUP)
<span>goo</span><span>gle</span>
<span>yan</span><span>dex</span>
          MARKUP
        end
      end

      # A link written inside another link must be removed as well.
      context 'content with recursive markup' do
        let(:html) do
          <<-MARKUP
<a href="https://google.com"><a href="https://google.com">goo</a><span>gle</span></a>
<a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should return content without links' do
          is_expected.to eq(<<-MARKUP)
goo<span>gle</span>
<span>yan</span><span>dex</span>
          MARKUP
        end
      end
    end

    context 'when whitelist passed' do
      subject { StringTools::HTML.remove_links(html, whitelist: ['yandex.ru']) }

      context 'domain link match to whitelisted' do
        let(:html) do
          <<-MARKUP
<a href="https://google.com"><span>goo</span><span>gle</span></a>
<a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should keep only whitelisted links' do
          is_expected.to eq(<<-MARKUP)
<span>goo</span><span>gle</span>
<a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end
      end

      # Whitelisting a domain also allows its subdomains.
      context 'link domain is subdomain of whitelisted' do
        let(:html) do
          <<-MARKUP
<a href="https://google.com"><span>goo</span><span>gle</span></a>
<a href="https://www.yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should keep only whitelisted links' do
          is_expected.to eq(<<-MARKUP)
<span>goo</span><span>gle</span>
<a href="https://www.yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end
      end

      # Whitelisting a subdomain does not allow its parent domain.
      context 'link domain is parent domain of whitelisted' do
        subject { StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru']) }

        let(:html) do
          <<-MARKUP
<a href="https://google.com"><span>goo</span><span>gle</span></a>
<a href="https://yandex.ru"><span>yan</span><span>dex</span></a>
          MARKUP
        end

        it 'should remove link' do
          is_expected.to eq(<<-MARKUP)
<span>goo</span><span>gle</span>
<span>yan</span><span>dex</span>
          MARKUP
        end
      end
    end
  end
end
3 changes: 2 additions & 1 deletion string_tools.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,14 @@ Gem::Specification.new do |spec|
spec.add_runtime_dependency 'addressable', '~> 2.3.2'
spec.add_runtime_dependency 'ru_propisju', '~> 2.1.4'
spec.add_runtime_dependency 'sanitize', '>= 3.1.2'
spec.add_runtime_dependency 'loofah', '>= 2.0.0'

spec.add_development_dependency 'bundler', '~> 1.7'
spec.add_development_dependency 'rake', '~> 10.0'
spec.add_development_dependency 'rspec', '>= 2.14.0'
spec.add_development_dependency 'rspec-rails', '>= 2.14.0'
spec.add_development_dependency 'rspec-given', '~> 3.5'
spec.add_development_dependency 'shoulda-matchers'
spec.add_development_dependency 'shoulda-matchers', '~> 2.0'
spec.add_development_dependency 'appraisal', '>= 1.0.2'
spec.add_development_dependency 'combustion', '>= 0.5.3'
spec.add_development_dependency 'simplecov', '>= 0.9'
Expand Down

0 comments on commit e48da9f

Please sign in to comment.