From e48da9ff1f92a50fe2ee8b05fee098e185c7e5ff Mon Sep 17 00:00:00 2001
From: Dmitry Bochkarev
Date: Mon, 12 Oct 2015 22:38:30 +0500
Subject: [PATCH] =?UTF-8?q?feature:=20=D1=83=D0=B4=D0=B0=D0=BB=D0=B5=D0=BD?=
 =?UTF-8?q?=D0=B8=D0=B5=20=D1=81=D1=81=D1=8B=D0=BB=D0=BE=D0=BA=20=D0=B8?=
 =?UTF-8?q?=D0=B7=20=D1=82=D0=B5=D0=BA=D1=81=D1=82=D0=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

https://jira.railsc.ru/browse/SERVICES-597
---
 .gitignore               |   1 +
 Gemfile                  |   1 +
 lib/string_tools.rb      |   2 +
 lib/string_tools/html.rb |  70 ++++++++++++++++++++++++++
 spec/html_spec.rb        | 104 +++++++++++++++++++++++++++++++++++++++
 string_tools.gemspec     |   3 +-
 6 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 lib/string_tools/html.rb
 create mode 100644 spec/html_spec.rb

diff --git a/.gitignore b/.gitignore
index 0cb6eeb..44b81d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@
 /pkg/
 /spec/reports/
 /tmp/
+/gemfiles
diff --git a/Gemfile b/Gemfile
index 1c661d1..ef0e78a 100644
--- a/Gemfile
+++ b/Gemfile
@@ -2,3 +2,4 @@ source 'https://rubygems.org'
 
 # Specify your gem's dependencies in string_tools.gemspec
 gemspec
+
diff --git a/lib/string_tools.rb b/lib/string_tools.rb
index 3d4f996..3c58d49 100644
--- a/lib/string_tools.rb
+++ b/lib/string_tools.rb
@@ -6,6 +6,8 @@
 require 'string_tools/core_ext/string'
 
 module StringTools
+  autoload :HTML, 'string_tools/html'
+
   module CharDet
     # Возвращает true если строка содержит допустимую
     # последовательность байтов для кодировки utf8 и false в обратном случае
diff --git a/lib/string_tools/html.rb b/lib/string_tools/html.rb
new file mode 100644
index 0000000..0865e3b
--- /dev/null
+++ b/lib/string_tools/html.rb
@@ -0,0 +1,70 @@
+# coding: utf-8
+require 'loofah'
+require 'uri'
+
+module StringTools
+  module HTML
+    # Public: Removes links that point to domains outside the whitelist,
+    # keeping the contents of each removed link in place.
+    #
+    # html    - String with markup that may contain unwanted links
+    # options - Hash
+    #           :whitelist - Array of String allowed domains (required)
+    #
+    # Examples
+    #   html = '<a href="https://www.yandex.ru">yandex</a>'
+    #
+    #   StringTools::HTML.remove_links(html, whitelist: ['google.com'])
+    #   # => 'yandex'
+    #
+    #   StringTools::HTML.remove_links(html, whitelist: ['yandex.ru'])
+    #   # => '<a href="https://www.yandex.ru">yandex</a>'
+    #
+    #   StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru'])
+    #   # => '<a href="https://www.yandex.ru">yandex</a>'
+    #
+    #   html = '<a href="https://yandex.ru">yandex</a>'
+    #
+    #   StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru'])
+    #   # => 'yandex'
+    #
+    # Returns String without links to external resources
+    def self.remove_links(html, options = {})
+      Loofah.fragment(html).scrub!(LinksRemoveScrubber.new(options)).to_s
+    end
+
+    # Loofah scrubber that replaces every <a> whose host is not whitelisted
+    # with the link's own children.
+    class LinksRemoveScrubber < Loofah::Scrubber
+      # options - Hash; :whitelist (Array of String) is required,
+      #           Hash#fetch raises KeyError when it is missing
+      def initialize(options)
+        @whitelist = options.fetch(:whitelist)
+      end
+
+      # Ignores nodes other than <a>; unwraps an <a> unless its href host is
+      # whitelisted. Links without a host (missing href, relative path,
+      # mailto:, javascript:) or with an unparsable href are unwrapped too.
+      def scrub(node)
+        return unless node.name == 'a'.freeze
+        host = URI.parse(node['href'.freeze].to_s).host
+        node.swap(node.children) unless host && whitelisted?(host)
+      rescue URI::InvalidURIError => _
+        node.swap(node.children)
+      end
+
+      # Returns true when domain, or a parent domain of it made of at
+      # least two labels, is listed in the whitelist.
+      def whitelisted?(domain)
+        host_parts = domain.split('.'.freeze).reverse!
+        host = host_parts[0] # com, ru ...
+        1.upto(host_parts.length - 1) do |i|
+          subdomain = host_parts[i]
+          host = "#{subdomain}.#{host}"
+          return true if @whitelist.include? host
+        end
+        false
+      end
+    end
+  end
+end
diff --git a/spec/html_spec.rb b/spec/html_spec.rb
new file mode 100644
index 0000000..2047ec0
--- /dev/null
+++ b/spec/html_spec.rb
@@ -0,0 +1,104 @@
+# coding: utf-8
+require 'spec_helper'
+
+describe StringTools::HTML do
+  describe '.remove_links' do
+    context 'whitelist option empty' do
+      subject { StringTools::HTML.remove_links(html, whitelist: []) }
+
+      context 'content without links' do
+        let(:html) { ' hello ' }
+
+        it 'should return html as is' do
+          is_expected.to eq html
+        end
+      end
+
+      context 'content with links' do
+        let(:html) do
+          <<-MARKUP
+            google
+            yandex
+          MARKUP
+        end
+
+        it 'should return markup without links' do
+          is_expected.to eq(<<-MARKUP)
+            google
+            yandex
+          MARKUP
+        end
+      end
+
+      context 'content with recursive markup' do
+        let(:html) do
+          <<-MARKUP
+            google
+            yandex
+          MARKUP
+        end
+
+        it 'should return content without links' do
+          is_expected.to eq(<<-MARKUP)
+            google
+            yandex
+          MARKUP
+        end
+      end
+    end
+
+    context 'when whitelist passed' do
+      subject { StringTools::HTML.remove_links(html, whitelist: ['yandex.ru']) }
+
+      context 'domain link match to whitelisted' do
+        let(:html) do
+          <<-MARKUP
+            google
+            yandex
+          MARKUP
+        end
+
+        it 'should keep only whitelisted links' do
+          is_expected.to eq(<<-MARKUP)
+            google
+            yandex
+          MARKUP
+        end
+      end
+
+      context 'link domain is subdomain of whitelisted' do
+        let(:html) do
+          <<-MARKUP
+            google
+            yandex
+          MARKUP
+        end
+
+        it 'should keep only whitelisted links' do
+          is_expected.to eq(<<-MARKUP)
+            google
+            yandex
+          MARKUP
+        end
+      end
+
+      context 'link domain is parent domain of whitelisted' do
+        subject { StringTools::HTML.remove_links(html, whitelist: ['www.yandex.ru']) }
+
+        let(:html) do
+          <<-MARKUP
+            google
+            yandex
+          MARKUP
+        end
+
+        it 'should remove link' do
+          is_expected.to eq(<<-MARKUP)
+            google
+            yandex
+          MARKUP
+        end
+      end
+    end
+  end
+end
diff --git a/string_tools.gemspec b/string_tools.gemspec
index d80ab31..2f63838 100644
--- a/string_tools.gemspec
+++ b/string_tools.gemspec
@@ -26,13 +26,14 @@ Gem::Specification.new do |spec|
   spec.add_runtime_dependency 'addressable', '~> 2.3.2'
   spec.add_runtime_dependency 'ru_propisju', '~> 2.1.4'
   spec.add_runtime_dependency 'sanitize', '>= 3.1.2'
+  spec.add_runtime_dependency 'loofah', '>= 2.0.0'
 
   spec.add_development_dependency 'bundler', '~> 1.7'
   spec.add_development_dependency 'rake', '~> 10.0'
   spec.add_development_dependency 'rspec', '>= 2.14.0'
   spec.add_development_dependency 'rspec-rails', '>= 2.14.0'
   spec.add_development_dependency 'rspec-given', '~> 3.5'
-  spec.add_development_dependency 'shoulda-matchers'
+  spec.add_development_dependency 'shoulda-matchers', '~> 2.0'
   spec.add_development_dependency 'appraisal', '>= 1.0.2'
   spec.add_development_dependency 'combustion', '>= 0.5.3'
   spec.add_development_dependency 'simplecov', '>= 0.9'