From e05076f181ce413fa658218551ac367a5372678f Mon Sep 17 00:00:00 2001 From: Dmitry Bochkarev Date: Fri, 23 Oct 2015 00:29:35 +0500 Subject: [PATCH] =?UTF-8?q?fix(html):=20=D0=BF=D0=BE=D0=B4=D0=B4=D0=B5?= =?UTF-8?q?=D1=80=D0=B6=D0=BA=D0=B0=20=D0=BE=D1=82=D0=BD=D0=BE=D1=81=D0=B8?= =?UTF-8?q?=D1=82=D0=B5=D0=BB=D1=8C=D0=BD=D1=8B=D1=85=20=D0=BF=D1=83=D1=82?= =?UTF-8?q?=D0=B5=D0=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://jira.railsc.ru/browse/SERVICES-594 --- lib/string_tools/html.rb | 16 ++++++++++------ spec/html_spec.rb | 24 +++++++++++++++++++++++- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/lib/string_tools/html.rb b/lib/string_tools/html.rb index ff4b19d..d3d66b9 100644 --- a/lib/string_tools/html.rb +++ b/lib/string_tools/html.rb @@ -1,6 +1,6 @@ # coding: utf-8 require 'loofah' -require 'uri' +require 'addressable/uri' module StringTools module HTML @@ -44,16 +44,20 @@ def initialize(options) def scrub(node) return unless node.name == 'a'.freeze - uri = URI.parse(node['href'.freeze]) + href = node['href'] + return if href.blank? + uri = Addressable::URI.parse(href).normalize + return unless uri.host node.swap(node.children) unless whitelisted? uri.host - rescue URI::InvalidURIError => _ + rescue + # в любой непонятной ситуации просто удаляем ссылку node.swap(node.children) end def whitelisted?(domain) - host_parts = domain.split('.'.freeze).reverse! - host = host_parts[0] # com, ru ... - 1.upto(host_parts.length - 1) do |i| + host_parts = domain.split('.'.freeze) + host = host_parts[-1] # com, ru ... + (host_parts.length - 2).downto(0) do |i| subdomain = host_parts[i] host = "#{subdomain}.#{host}" return true if @whitelist.include? host diff --git a/spec/html_spec.rb b/spec/html_spec.rb index 2047ec0..9c94f8a 100644 --- a/spec/html_spec.rb +++ b/spec/html_spec.rb @@ -48,11 +48,14 @@ end context 'when whitelist passed' do - subject { StringTools::HTML.remove_links(html, whitelist: ['yandex.ru']) } + subject { StringTools::HTML.remove_links(html, whitelist: ['yandex.ru', 'pulscen.com.ua']) } context 'domain link match to whitelisted' do let(:html) do <<-MARKUP + firm.pulscen.com.ua + pulscen.com.ua + com.ua google yandex MARKUP @@ -60,6 +63,9 @@ it 'should keep only whitelisted links' do is_expected.to eq(<<-MARKUP) + firm.pulscen.com.ua + pulscen.com.ua + com.ua google yandex MARKUP @@ -99,6 +105,22 @@ MARKUP end end + + context 'content with relative links' do + let(:html) do + <<-MARKUP + google + yandex + MARKUP + end + + it 'should keep relative links' do + is_expected.to eq(<<-MARKUP) + google + yandex + MARKUP + end + end end end end