From 7c59058d273940b46523676f22cd4f3aac219a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 15 Jun 2023 17:40:22 +0200 Subject: [PATCH] Cache provider output per request --- scrapy_zyte_api/providers.py | 43 ++++++++++++++----- tests/conftest.py | 8 ++++ tests/test_providers.py | 82 +++++++++++++++++++++++++++++++----- 3 files changed, 112 insertions(+), 21 deletions(-) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 47c5ea01..26359a85 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -1,4 +1,5 @@ -from typing import Any, Callable, List, Sequence, Set +from typing import Any, Callable, Dict, List, Sequence, Set, Type +from weakref import WeakKeyDictionary from scrapy import Request from scrapy.crawler import Crawler @@ -21,11 +22,29 @@ class ZyteApiProvider(PageObjectInputProvider): provided_classes = {BrowserResponse, BrowserHtml, Product} + def __init__(self, injector): + super().__init__(injector) + self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary() + + def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None: + if request not in self._cached_instances: + self._cached_instances[request] = {} + self._cached_instances[request].update(mapping) + async def __call__( self, to_provide: Set[Callable], request: Request, crawler: Crawler ) -> Sequence[Any]: """Makes a Zyte API request to provide BrowserResponse and/or item dependencies.""" # TODO what if ``response`` is already from Zyte API and contains something we need + results: List[Any] = [] + + for cls in list(to_provide): + item = self._cached_instances.get(request, {}).get(cls) + if item: + results.append(item) + to_provide.remove(cls) + if not to_provide: + return results html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide item_keywords = {Product: "product"} @@ -36,7 +55,7 @@ async def __call__( for item_type, kw in item_keywords.items(): if item_type in to_provide: zyte_api_meta[kw] = True - request = Request( + api_request = Request( url=request.url, meta={ "zyte_api": zyte_api_meta, @@ -45,26 +64,28 @@ async def __call__( callback=NO_CALLBACK, ) api_response: ZyteAPITextResponse = await maybe_deferred_to_future( - crawler.engine.download(request) + crawler.engine.download(api_request) ) assert api_response.raw_api_response - results: List[Any] = [] if html_requested: html = BrowserHtml(api_response.raw_api_response["browserHtml"]) else: html = None if BrowserHtml in to_provide: results.append(html) + self.update_cache(request, {BrowserHtml: html}) if BrowserResponse in to_provide: - results.append( - BrowserResponse( - url=api_response.url, - status=api_response.status, - html=html, - ) + response = BrowserResponse( + url=api_response.url, + status=api_response.status, + html=html, ) + results.append(response) + self.update_cache(request, {BrowserResponse: response}) for item_type, kw in item_keywords.items(): if item_type in to_provide: - results.append(item_type.from_dict(api_response.raw_api_response[kw])) + item = item_type.from_dict(api_response.raw_api_response[kw]) + results.append(item) + self.update_cache(request, {item_type: item}) return results diff --git a/tests/conftest.py b/tests/conftest.py index db2b302e..6acd1c42 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,3 +7,11 @@ def mockserver(): with MockServer() as server: yield server + + +@pytest.fixture(scope="function") +def fresh_mockserver(): + from .mockserver import MockServer + + with MockServer() as server: + yield server diff --git a/tests/test_providers.py b/tests/test_providers.py index 2e0611b8..0c8fa538 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -73,32 +73,94 @@ def url(self) -> str: return str(self.response.url) -class ItemDepSpider(ZyteAPISpider): - def parse_(self, response: DummyResponse, product: Product, my_item: MyItem): # type: ignore[override] - yield { - "product": product, - "my_item": my_item, - } +@ensureDeferred +async def test_itemprovider_requests_direct_dependencies(fresh_mockserver): + class ItemDepSpider(ZyteAPISpider): + def parse_( # type: ignore[override] + self, + response: DummyResponse, + browser_response: BrowserResponse, + product: Product, + ): + yield { + "product": product, + "browser_response": browser_response, + } + + port = get_ephemeral_port() + handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage) + + settings = create_scrapy_settings(None) + settings.update(SETTINGS) + settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100} + item, url, _ = await crawl_single_item( + ItemDepSpider, HtmlResource, settings, port=port + ) + count_resp = await Agent(reactor).request( + b"GET", fresh_mockserver.urljoin("/count").encode() + ) + call_count = int((await readBody(count_resp)).decode()) + assert call_count == 1 + assert "browser_response" in item + assert "product" in item # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/91 @pytest.mark.xfail(reason="Not implemented yet", raises=AssertionError, strict=True) @ensureDeferred -async def test_itemprovider_requests(mockserver): +async def test_itemprovider_requests_indirect_dependencies(fresh_mockserver): + class ItemDepSpider(ZyteAPISpider): + def parse_(self, response: DummyResponse, product: Product, my_item: MyItem): # type: ignore[override] + yield { + "product": product, + "my_item": my_item, + } + port = get_ephemeral_port() - handle_urls(f"{mockserver.host}:{port}")(MyPage) + handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage) settings = create_scrapy_settings(None) settings.update(SETTINGS) - settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/") settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100} item, url, _ = await crawl_single_item( ItemDepSpider, HtmlResource, settings, port=port ) count_resp = await Agent(reactor).request( - b"GET", mockserver.urljoin("/count").encode() + b"GET", fresh_mockserver.urljoin("/count").encode() + ) + call_count = int((await readBody(count_resp)).decode()) + assert call_count == 1 + assert "my_item" in item + assert "product" in item + + +@ensureDeferred +async def test_itemprovider_requests_indirect_dependencies_workaround(fresh_mockserver): + class ItemDepSpider(ZyteAPISpider): + def parse_(self, response: DummyResponse, product: Product, browser_response: BrowserResponse, my_item: MyItem): # type: ignore[override] + yield { + "product": product, + "my_item": my_item, + "browser_response": browser_response, + } + + port = get_ephemeral_port() + handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage) + + settings = create_scrapy_settings(None) + settings.update(SETTINGS) + settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1} + item, url, _ = await crawl_single_item( + ItemDepSpider, HtmlResource, settings, port=port + ) + count_resp = await Agent(reactor).request( + b"GET", fresh_mockserver.urljoin("/count").encode() ) call_count = int((await readBody(count_resp)).decode()) assert call_count == 1 assert "my_item" in item assert "product" in item + assert "browser_response" in item