-
Notifications
You must be signed in to change notification settings - Fork 453
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add SplashHtmlResponse (extended #115) #160
base: master
Are you sure you want to change the base?
Changes from all commits
c140615
d4c280a
9ff12e4
67edaae
5a59297
7b1cbcf
d951b56
d6584ec
4d0c305
b2be0d5
692f590
268cbc1
f16d6d3
1815f51
6340dad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ | |
from scrapy.exceptions import NotConfigured | ||
from scrapy.http.headers import Headers | ||
from scrapy.http.response.text import TextResponse | ||
from scrapy.http.response.html import HtmlResponse | ||
from scrapy import signals | ||
|
||
from scrapy_splash.responsetypes import responsetypes | ||
|
@@ -397,19 +398,24 @@ def process_response(self, request, response, spider): | |
return response | ||
|
||
def _change_response_class(self, request, response): | ||
from scrapy_splash import SplashResponse, SplashTextResponse | ||
if not isinstance(response, (SplashResponse, SplashTextResponse)): | ||
from scrapy_splash import SplashResponse | ||
from scrapy_splash.response import splash_scrapy_text_responses | ||
splash_text_response_types = tuple(x for x, y in splash_scrapy_text_responses) | ||
|
||
if not isinstance(response, (SplashResponse, splash_text_response_types)): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your new |
||
# create a custom Response subclass based on response Content-Type | ||
# XXX: usually request is assigned to response only when all | ||
# downloader middlewares are executed. Here it is set earlier. | ||
# Does it have any negative consequences? | ||
respcls = responsetypes.from_args(headers=response.headers) | ||
if isinstance(response, TextResponse) and respcls is SplashResponse: | ||
for splash_cls, scrapy_cls in splash_scrapy_text_responses: | ||
# Even if the headers say it's binary, it has already | ||
# been detected as a text response by scrapy (for example | ||
# been detected as a text/html response by scrapy (for example | ||
# because it was decoded successfully), so we should not | ||
# convert it to SplashResponse. | ||
respcls = SplashTextResponse | ||
if isinstance(response, scrapy_cls) and respcls is SplashResponse: | ||
respcls = splash_cls | ||
break | ||
response = response.replace(cls=respcls, request=request) | ||
return response | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,11 +3,12 @@ | |
import copy | ||
import json | ||
import base64 | ||
import pytest | ||
|
||
import scrapy | ||
from scrapy.core.engine import ExecutionEngine | ||
from scrapy.utils.test import get_crawler | ||
from scrapy.http import Response, TextResponse | ||
from scrapy.http import Response, TextResponse, HtmlResponse | ||
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware | ||
|
||
import scrapy_splash | ||
|
@@ -18,6 +19,13 @@ | |
SlotPolicy, | ||
SplashCookiesMiddleware, | ||
SplashDeduplicateArgsMiddleware, | ||
SplashHtmlResponse, | ||
SplashTextResponse, | ||
) | ||
|
||
splash_scrapy_content_types = ( | ||
(SplashTextResponse, TextResponse, b'text/*'), | ||
(SplashHtmlResponse, HtmlResponse, b'text/html'), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
) | ||
|
||
|
||
|
@@ -60,7 +68,11 @@ def test_nosplash(): | |
assert response3.url == "http://example.com" | ||
|
||
|
||
def test_splash_request(): | ||
@pytest.mark.parametrize( | ||
'splash_response_type, scrapy_response_type, content_type', | ||
splash_scrapy_content_types, | ||
) | ||
def test_splash_request(splash_response_type, scrapy_response_type, content_type): | ||
mw = _get_mw() | ||
cookie_mw = _get_cookie_mw() | ||
|
||
|
@@ -82,25 +94,26 @@ def test_splash_request(): | |
assert json.loads(to_native_str(req2.body)) == expected_body | ||
|
||
# check response post-processing | ||
response = TextResponse("http://127.0.0.1:8050/render.html", | ||
response = scrapy_response_type( | ||
url="http://127.0.0.1:8050/render.html", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💄 Too much indentation, I would say. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agree |
||
# Scrapy doesn't pass request to constructor | ||
# request=req2, | ||
headers={b'Content-Type': b'text/html'}, | ||
body=b"<html><body>Hello</body></html>") | ||
headers={b'Content-Type': content_type}, | ||
body=b"<html><body>Hello</body></html>", ) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💄 The added comma is not necessary. It makes sense to add commas at the end if the closing parenthesis is on a different line, that allows to keep a clean diff when adding new parameters. But if you keep the closing parenthesis on the same line, there is no benefit to it. |
||
response2 = mw.process_response(req2, response, None) | ||
response2 = cookie_mw.process_response(req2, response2, None) | ||
assert isinstance(response2, scrapy_splash.SplashTextResponse) | ||
assert isinstance(response2, splash_response_type) | ||
assert response2 is not response | ||
assert response2.real_url == req2.url | ||
assert response2.url == req.url | ||
assert response2.body == b"<html><body>Hello</body></html>" | ||
assert response2.css("body").extract_first() == "<body>Hello</body>" | ||
assert response2.headers == {b'Content-Type': [b'text/html']} | ||
assert response2.headers == {b'Content-Type': [content_type, ]} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💄 That comma is only needed for tuples, not lists. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. idk, it makes it looking more like a list rather an object. |
||
|
||
# check .replace method | ||
response3 = response2.replace(status=404) | ||
assert response3.status == 404 | ||
assert isinstance(response3, scrapy_splash.SplashTextResponse) | ||
assert isinstance(response3, splash_response_type) | ||
for attr in ['url', 'real_url', 'headers', 'body']: | ||
assert getattr(response3, attr) == getattr(response2, attr) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
from scrapy.http import HtmlResponse, TextResponse, Response | ||
from scrapy_splash.response import ( | ||
SplashTextResponse, SplashHtmlResponse, SplashResponse, | ||
) | ||
|
||
|
||
def test_response_types(): | ||
assert issubclass(SplashResponse, Response) | ||
assert issubclass(SplashTextResponse, TextResponse) | ||
assert issubclass(SplashHtmlResponse, HtmlResponse) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don’t see where this import is used in your changes.