scrapy-plugins · iAnanich · Mar 27, 2017 · Mar 27, 2017 · Mar 27, 2017 · Mar 27, 2017
diff --git a/scrapy_splash/__init__.py b/scrapy_splash/__init__.py
@@ -9,5 +9,7 @@
 )
 from .dupefilter import SplashAwareDupeFilter, splash_request_fingerprint
 from .cache import SplashAwareFSCacheStorage
-from .response import SplashResponse, SplashTextResponse, SplashJsonResponse
+from .response import (
+    SplashResponse, SplashTextResponse, SplashHtmlResponse, SplashJsonResponse,
+)
 from .request import SplashRequest, SplashFormRequest
diff --git a/scrapy_splash/middleware.py b/scrapy_splash/middleware.py
@@ -14,6 +14,7 @@
 from scrapy.exceptions import NotConfigured
 from scrapy.http.headers import Headers
 from scrapy.http.response.text import TextResponse
+from scrapy.http.response.html import HtmlResponse
 from scrapy import signals
 
 from scrapy_splash.responsetypes import responsetypes
@@ -397,19 +398,24 @@ def process_response(self, request, response, spider):
         return response
 
     def _change_response_class(self, request, response):
-        from scrapy_splash import SplashResponse, SplashTextResponse
-        if not isinstance(response, (SplashResponse, SplashTextResponse)):
+        from scrapy_splash import SplashResponse
+        from scrapy_splash.response import splash_scrapy_text_responses
+        splash_text_response_types = tuple(x for x, y in splash_scrapy_text_responses)
+
+        if not isinstance(response, (SplashResponse, splash_text_response_types)):
             # create a custom Response subclass based on response Content-Type
             # XXX: usually request is assigned to response only when all
             # downloader middlewares are executed. Here it is set earlier.
             # Does it have any negative consequences?
             respcls = responsetypes.from_args(headers=response.headers)
-            if isinstance(response, TextResponse) and respcls is SplashResponse:
+            for splash_cls, scrapy_cls in splash_scrapy_text_responses:
                 # Even if the headers say it's binary, it has already
-                # been detected as a text response by scrapy (for example
+                # been detected as a text/html response by scrapy (for example
                 # because it was decoded successfully), so we should not
                 # convert it to SplashResponse.
-                respcls = SplashTextResponse
+                if isinstance(response, scrapy_cls) and respcls is SplashResponse:
+                    respcls = splash_cls
+                    break
             response = response.replace(cls=respcls, request=request)
         return response
 

diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py
@@ -5,7 +5,7 @@
 import base64
 import re
 
-from scrapy.http import Response, TextResponse
+from scrapy.http import Response, TextResponse, HtmlResponse
 from scrapy import Selector
 
 from scrapy_splash.utils import headers_to_scrapy
@@ -87,6 +87,15 @@ def replace(self, *args, **kwargs):
         return _SplashResponseMixin.replace(self, *args, **kwargs)
 
 
+class SplashHtmlResponse(SplashTextResponse, HtmlResponse):
+    """
+    This HtmlResponse subclass sets response.url to the URL of a remote website
+    instead of an URL of Splash server. "Real" response URL is still available
+    as ``response.real_url``.
+    """
+    pass
+
+
 class SplashJsonResponse(SplashResponse):
     """
     Splash Response with JSON data. It provides a convenient way to access
@@ -185,3 +194,9 @@ def _load_from_json(self):
         # response.headers
         if 'headers' in self.data:
             self.headers = headers_to_scrapy(self.data['headers'])
+
+
+splash_scrapy_text_responses = (
+    (SplashTextResponse, TextResponse),
+    (SplashHtmlResponse, HtmlResponse),
+)
diff --git a/scrapy_splash/responsetypes.py b/scrapy_splash/responsetypes.py
@@ -9,12 +9,12 @@
 
 class SplashResponseTypes(ResponseTypes):
     CLASSES = {
-        'text/html': 'scrapy_splash.response.SplashTextResponse',
+        'text/html': 'scrapy_splash.response.SplashHtmlResponse',
         'application/atom+xml': 'scrapy_splash.response.SplashTextResponse',
         'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse',
         'application/rss+xml': 'scrapy_splash.response.SplashTextResponse',
-        'application/xhtml+xml': 'scrapy_splash.response.SplashTextResponse',
-        'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashTextResponse',
+        'application/xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse',
+        'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse',
         'application/xml': 'scrapy_splash.response.SplashTextResponse',
         'application/json': 'scrapy_splash.response.SplashJsonResponse',
         'application/x-json': 'scrapy_splash.response.SplashJsonResponse',

diff --git a/tests/test_middleware.py b/tests/test_middleware.py
@@ -3,11 +3,12 @@
 import copy
 import json
 import base64
+import pytest
 
 import scrapy
 from scrapy.core.engine import ExecutionEngine
 from scrapy.utils.test import get_crawler
-from scrapy.http import Response, TextResponse
+from scrapy.http import Response, TextResponse, HtmlResponse
 from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
 
 import scrapy_splash
@@ -18,6 +19,13 @@
     SlotPolicy,
     SplashCookiesMiddleware,
     SplashDeduplicateArgsMiddleware,
+    SplashHtmlResponse,
+    SplashTextResponse,
+)
+
+splash_scrapy_content_types = (
+    (SplashTextResponse, TextResponse, b'text/*'),
+    (SplashHtmlResponse, HtmlResponse, b'text/html'),
 )
 
 
@@ -60,7 +68,11 @@ def test_nosplash():
     assert response3.url == "http://example.com"
 
 
-def test_splash_request():
+@pytest.mark.parametrize(
+    'splash_response_type, scrapy_response_type, content_type',
+    splash_scrapy_content_types,
+)
+def test_splash_request(splash_response_type, scrapy_response_type, content_type):
     mw = _get_mw()
     cookie_mw = _get_cookie_mw()
 
@@ -82,25 +94,26 @@ def test_splash_request():
     assert json.loads(to_native_str(req2.body)) == expected_body
 
     # check response post-processing
-    response = TextResponse("http://127.0.0.1:8050/render.html",
+    response = scrapy_response_type(
+                            url="http://127.0.0.1:8050/render.html",
                             # Scrapy doesn't pass request to constructor
                             # request=req2,
-                            headers={b'Content-Type': b'text/html'},
-                            body=b"<html><body>Hello</body></html>")
+                            headers={b'Content-Type': content_type},
+                            body=b"<html><body>Hello</body></html>", )
     response2 = mw.process_response(req2, response, None)
     response2 = cookie_mw.process_response(req2, response2, None)
-    assert isinstance(response2, scrapy_splash.SplashTextResponse)
+    assert isinstance(response2, splash_response_type)
     assert response2 is not response
     assert response2.real_url == req2.url
     assert response2.url == req.url
     assert response2.body == b"<html><body>Hello</body></html>"
     assert response2.css("body").extract_first() == "<body>Hello</body>"
-    assert response2.headers == {b'Content-Type': [b'text/html']}
+    assert response2.headers == {b'Content-Type': [content_type, ]}
 
     # check .replace method
     response3 = response2.replace(status=404)
     assert response3.status == 404
-    assert isinstance(response3, scrapy_splash.SplashTextResponse)
+    assert isinstance(response3, splash_response_type)
     for attr in ['url', 'real_url', 'headers', 'body']:
         assert getattr(response3, attr) == getattr(response2, attr)
 

diff --git a/tests/test_response.py b/tests/test_response.py
@@ -0,0 +1,10 @@
+from scrapy.http import HtmlResponse, TextResponse, Response
+from scrapy_splash.response import (
+    SplashTextResponse, SplashHtmlResponse, SplashResponse,
+)
+
+
+def test_response_types():
+    assert issubclass(SplashResponse, Response)
+    assert issubclass(SplashTextResponse, TextResponse)
+    assert issubclass(SplashHtmlResponse, HtmlResponse)