diff --git a/README.md b/README.md
index edcc5b2..9fc67c1 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,10 @@ from scrapy_selenium import SeleniumRequest
 yield SeleniumRequest(url, self.parse_result)
 ```
 
-The request will be handled by selenium, and the response will have an additional `meta` key, named `driver` containing the selenium driver with the request processed.
+The request will be handled by selenium, and the request will have an additional `meta` key, named `driver`, containing the selenium driver with the request processed.
 ```python
 def parse_result(self, response):
-    print(response.meta['driver'].title)
+    print(response.request.meta['driver'].title)
 ```
 
 For more information about the available driver methods and attributes, refer to the [selenium python documentation](http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webdriver)
@@ -52,7 +52,7 @@ def parse_result(self, response):
 ```
 
 ### Additional arguments
-The `scrapy_selenium.SeleniumRequest` accept 3 additional arguments:
+The `scrapy_selenium.SeleniumRequest` accepts 4 additional arguments:
 
 #### `wait_time` / `wait_until`
 
@@ -80,6 +80,15 @@ yield SeleniumRequest(
     url,
     self.parse_result,
 def parse_result(self, response):
     with open('image.png', 'wb') as image_file:
-        image_file.write(response.meta['screenshot])
+        image_file.write(response.meta['screenshot'])
 ```
+#### `script`
+When used, selenium will execute custom JavaScript code on the page before returning the response.
+```python
+yield SeleniumRequest(
+    url,
+    self.parse_result,
+    script='window.scrollTo(0, document.body.scrollHeight);',
+)
+```
diff --git a/scrapy_selenium/http.py b/scrapy_selenium/http.py
index fb1db34..cddf7bf 100644
--- a/scrapy_selenium/http.py
+++ b/scrapy_selenium/http.py
@@ -6,7 +6,7 @@ class SeleniumRequest(Request):
     """Scrapy ``Request`` subclass providing additional arguments"""
 
-    def __init__(self, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
+    def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=None, *args, **kwargs):
         """Initialize a new selenium request
 
         Parameters
         ----------
@@ -19,11 +19,14 @@ def __init__(self, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
         screenshot: bool
             If True, a screenshot of the page will be taken and the data of the
             screenshot will be returned in the response "meta" attribute.
+        script: str
+            JavaScript code to execute.
         """
 
         self.wait_time = wait_time
         self.wait_until = wait_until
         self.screenshot = screenshot
+        self.script = script
 
         super().__init__(*args, **kwargs)
diff --git a/scrapy_selenium/middlewares.py b/scrapy_selenium/middlewares.py
index 1a4d8a8..3761ca5 100644
--- a/scrapy_selenium/middlewares.py
+++ b/scrapy_selenium/middlewares.py
@@ -99,6 +99,9 @@ def process_request(self, request, spider):
         if request.screenshot:
             request.meta['screenshot'] = self.driver.get_screenshot_as_png()
 
+        if request.script:
+            self.driver.execute_script(request.script)
+
         body = str.encode(self.driver.page_source)
 
         # Expose the driver via the "meta" attribute
diff --git a/setup.cfg b/setup.cfg
index 220464a..2ca31e9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = scrapy-selenium
-version = 0.0.6
+version = 0.0.7
 url = https://github.com/clemfromspace/scrapy-selenium
 licence = DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
 description = Scrapy with selenium
diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py
index 064f81a..fe365e4 100644
--- a/tests/test_middlewares.py
+++ b/tests/test_middlewares.py
@@ -117,3 +117,21 @@ def test_process_request_should_return_a_screenshot_if_screenshot_option(self):
         )
 
         self.assertIsNotNone(html_response.meta['screenshot'])
+
+    def test_process_request_should_execute_script_if_script_option(self):
+        """Test that the ``process_request`` should execute the script and return a response"""
+
+        selenium_request = SeleniumRequest(
+            url='http://www.python.org',
+            script='document.title = "scrapy_selenium";'
+        )
+
+        html_response = self.selenium_middleware.process_request(
+            request=selenium_request,
+            spider=None
+        )
+
+        self.assertEqual(
+            html_response.selector.xpath('//title/text()').extract_first(),
+            'scrapy_selenium'
+        )