Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add SplashHtmlResponse (extended #115) #160

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion scrapy_splash/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,7 @@
)
from .dupefilter import SplashAwareDupeFilter, splash_request_fingerprint
from .cache import SplashAwareFSCacheStorage
from .response import SplashResponse, SplashTextResponse, SplashJsonResponse
from .response import (
SplashResponse, SplashTextResponse, SplashHtmlResponse, SplashJsonResponse,
)
from .request import SplashRequest, SplashFormRequest
16 changes: 11 additions & 5 deletions scrapy_splash/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from scrapy.exceptions import NotConfigured
from scrapy.http.headers import Headers
from scrapy.http.response.text import TextResponse
from scrapy.http.response.html import HtmlResponse
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don’t see where this import is used in your changes.

from scrapy import signals

from scrapy_splash.responsetypes import responsetypes
Expand Down Expand Up @@ -397,19 +398,24 @@ def process_response(self, request, response, spider):
return response

def _change_response_class(self, request, response):
from scrapy_splash import SplashResponse, SplashTextResponse
if not isinstance(response, (SplashResponse, SplashTextResponse)):
from scrapy_splash import SplashResponse
from scrapy_splash.response import splash_scrapy_text_responses
splash_text_response_types = tuple(x for x, y in splash_scrapy_text_responses)

if not isinstance(response, (SplashResponse, splash_text_response_types)):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your new SplashHtmlResponse class inherits from SplashTextResponse. Doesn’t that make these changes and some changes below unnecessary?

# create a custom Response subclass based on response Content-Type
# XXX: usually request is assigned to response only when all
# downloader middlewares are executed. Here it is set earlier.
# Does it have any negative consequences?
respcls = responsetypes.from_args(headers=response.headers)
if isinstance(response, TextResponse) and respcls is SplashResponse:
for splash_cls, scrapy_cls in splash_scrapy_text_responses:
# Even if the headers say it's binary, it has already
# been detected as a text response by scrapy (for example
# been detected as a text/html response by scrapy (for example
# because it was decoded successfully), so we should not
# convert it to SplashResponse.
respcls = SplashTextResponse
if isinstance(response, scrapy_cls) and respcls is SplashResponse:
respcls = splash_cls
break
response = response.replace(cls=respcls, request=request)
return response

Expand Down
17 changes: 16 additions & 1 deletion scrapy_splash/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import base64
import re

from scrapy.http import Response, TextResponse
from scrapy.http import Response, TextResponse, HtmlResponse
from scrapy import Selector

from scrapy_splash.utils import headers_to_scrapy
Expand Down Expand Up @@ -87,6 +87,15 @@ def replace(self, *args, **kwargs):
return _SplashResponseMixin.replace(self, *args, **kwargs)


class SplashHtmlResponse(SplashTextResponse, HtmlResponse):
"""
This HtmlResponse subclass sets response.url to the URL of a remote website
instead of an URL of Splash server. "Real" response URL is still available
as ``response.real_url``.
"""
pass


class SplashJsonResponse(SplashResponse):
"""
Splash Response with JSON data. It provides a convenient way to access
Expand Down Expand Up @@ -185,3 +194,9 @@ def _load_from_json(self):
# response.headers
if 'headers' in self.data:
self.headers = headers_to_scrapy(self.data['headers'])


splash_scrapy_text_responses = (
(SplashTextResponse, TextResponse),
(SplashHtmlResponse, HtmlResponse),
)
6 changes: 3 additions & 3 deletions scrapy_splash/responsetypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@

class SplashResponseTypes(ResponseTypes):
CLASSES = {
'text/html': 'scrapy_splash.response.SplashTextResponse',
'text/html': 'scrapy_splash.response.SplashHtmlResponse',
'application/atom+xml': 'scrapy_splash.response.SplashTextResponse',
'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse',
'application/rss+xml': 'scrapy_splash.response.SplashTextResponse',
'application/xhtml+xml': 'scrapy_splash.response.SplashTextResponse',
'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashTextResponse',
'application/xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse',
'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse',
'application/xml': 'scrapy_splash.response.SplashTextResponse',
'application/json': 'scrapy_splash.response.SplashJsonResponse',
'application/x-json': 'scrapy_splash.response.SplashJsonResponse',
Expand Down
29 changes: 21 additions & 8 deletions tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
import copy
import json
import base64
import pytest

import scrapy
from scrapy.core.engine import ExecutionEngine
from scrapy.utils.test import get_crawler
from scrapy.http import Response, TextResponse
from scrapy.http import Response, TextResponse, HtmlResponse
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware

import scrapy_splash
Expand All @@ -18,6 +19,13 @@
SlotPolicy,
SplashCookiesMiddleware,
SplashDeduplicateArgsMiddleware,
SplashHtmlResponse,
SplashTextResponse,
)

splash_scrapy_content_types = (
(SplashTextResponse, TextResponse, b'text/*'),
(SplashHtmlResponse, HtmlResponse, b'text/html'),
Copy link
Contributor

@Gallaecio Gallaecio Apr 5, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💄 Since these lines are not too long, and this variable is only used once, maybe the variable could be removed and its content moved below, inline?

)


Expand Down Expand Up @@ -60,7 +68,11 @@ def test_nosplash():
assert response3.url == "http://example.com"


def test_splash_request():
@pytest.mark.parametrize(
'splash_response_type, scrapy_response_type, content_type',
splash_scrapy_content_types,
)
def test_splash_request(splash_response_type, scrapy_response_type, content_type):
mw = _get_mw()
cookie_mw = _get_cookie_mw()

Expand All @@ -82,25 +94,26 @@ def test_splash_request():
assert json.loads(to_native_str(req2.body)) == expected_body

# check response post-processing
response = TextResponse("http://127.0.0.1:8050/render.html",
response = scrapy_response_type(
url="http://127.0.0.1:8050/render.html",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💄 Too much indentation, I would say.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree

# Scrapy doesn't pass request to constructor
# request=req2,
headers={b'Content-Type': b'text/html'},
body=b"<html><body>Hello</body></html>")
headers={b'Content-Type': content_type},
body=b"<html><body>Hello</body></html>", )
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💄 The added comma is not necessary.

It makes sense to add commas at the end if the closing parenthesis is on a different line, that allows to keep a clean diff when adding new parameters. But if you keep the closing parenthesis on the same line, there is no benefit to it.

response2 = mw.process_response(req2, response, None)
response2 = cookie_mw.process_response(req2, response2, None)
assert isinstance(response2, scrapy_splash.SplashTextResponse)
assert isinstance(response2, splash_response_type)
assert response2 is not response
assert response2.real_url == req2.url
assert response2.url == req.url
assert response2.body == b"<html><body>Hello</body></html>"
assert response2.css("body").extract_first() == "<body>Hello</body>"
assert response2.headers == {b'Content-Type': [b'text/html']}
assert response2.headers == {b'Content-Type': [content_type, ]}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💄 That comma is only needed for tuples, not lists.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

idk, it makes it looking more like a list rather an object.


# check .replace method
response3 = response2.replace(status=404)
assert response3.status == 404
assert isinstance(response3, scrapy_splash.SplashTextResponse)
assert isinstance(response3, splash_response_type)
for attr in ['url', 'real_url', 'headers', 'body']:
assert getattr(response3, attr) == getattr(response2, attr)

Expand Down
10 changes: 10 additions & 0 deletions tests/test_response.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from scrapy.http import HtmlResponse, TextResponse, Response
from scrapy_splash.response import (
SplashTextResponse, SplashHtmlResponse, SplashResponse,
)


def test_response_types():
assert issubclass(SplashResponse, Response)
assert issubclass(SplashTextResponse, TextResponse)
assert issubclass(SplashHtmlResponse, HtmlResponse)