Skip to content

Commit

Permalink
Merge pull request #94 from Gallaecio/provider-cache
Browse files Browse the repository at this point in the history
Cache provider output per request
  • Loading branch information
wRAR authored Jun 16, 2023
2 parents 3e03e87 + 7c59058 commit 3c82d07
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 21 deletions.
43 changes: 32 additions & 11 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Any, Callable, List, Sequence, Set
from typing import Any, Callable, Dict, List, Sequence, Set, Type
from weakref import WeakKeyDictionary

from scrapy import Request
from scrapy.crawler import Crawler
Expand All @@ -21,11 +22,29 @@ class ZyteApiProvider(PageObjectInputProvider):

provided_classes = {BrowserResponse, BrowserHtml, Product}

def __init__(self, injector):
    """Set up the provider and its per-request dependency cache."""
    super().__init__(injector)
    # Weak keys: a cache entry disappears automatically as soon as Scrapy
    # drops the originating Request, so the cache cannot grow unbounded.
    cache: WeakKeyDictionary = WeakKeyDictionary()
    self._cached_instances = cache

def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None:
if request not in self._cached_instances:
self._cached_instances[request] = {}
self._cached_instances[request].update(mapping)

async def __call__(
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
"""Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
# TODO what if ``response`` is already from Zyte API and contains something we need
results: List[Any] = []

for cls in list(to_provide):
item = self._cached_instances.get(request, {}).get(cls)
if item:
results.append(item)
to_provide.remove(cls)
if not to_provide:
return results

html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
item_keywords = {Product: "product"}
Expand All @@ -36,7 +55,7 @@ async def __call__(
for item_type, kw in item_keywords.items():
if item_type in to_provide:
zyte_api_meta[kw] = True
request = Request(
api_request = Request(
url=request.url,
meta={
"zyte_api": zyte_api_meta,
Expand All @@ -45,26 +64,28 @@ async def __call__(
callback=NO_CALLBACK,
)
api_response: ZyteAPITextResponse = await maybe_deferred_to_future(
crawler.engine.download(request)
crawler.engine.download(api_request)
)

assert api_response.raw_api_response
results: List[Any] = []
if html_requested:
html = BrowserHtml(api_response.raw_api_response["browserHtml"])
else:
html = None
if BrowserHtml in to_provide:
results.append(html)
self.update_cache(request, {BrowserHtml: html})
if BrowserResponse in to_provide:
results.append(
BrowserResponse(
url=api_response.url,
status=api_response.status,
html=html,
)
response = BrowserResponse(
url=api_response.url,
status=api_response.status,
html=html,
)
results.append(response)
self.update_cache(request, {BrowserResponse: response})
for item_type, kw in item_keywords.items():
if item_type in to_provide:
results.append(item_type.from_dict(api_response.raw_api_response[kw]))
item = item_type.from_dict(api_response.raw_api_response[kw])
results.append(item)
self.update_cache(request, {item_type: item})
return results
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,11 @@ def mockserver():

with MockServer() as server:
yield server


@pytest.fixture(scope="function")
def fresh_mockserver():
    """Yield a mock Zyte API server that is started anew for each test.

    Unlike the session-wide ``mockserver`` fixture, function scope gives every
    test its own server, so per-server state such as the request count starts
    from zero.
    """
    from .mockserver import MockServer

    with MockServer() as srv:
        yield srv
82 changes: 72 additions & 10 deletions tests/test_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,32 +73,94 @@ def url(self) -> str:
return str(self.response.url)


class ItemDepSpider(ZyteAPISpider):
def parse_(self, response: DummyResponse, product: Product, my_item: MyItem): # type: ignore[override]
yield {
"product": product,
"my_item": my_item,
}
@ensureDeferred
async def test_itemprovider_requests_direct_dependencies(fresh_mockserver):
    """BrowserResponse and Product declared directly on the callback should be
    served from a single Zyte API call."""

    class ItemDepSpider(ZyteAPISpider):
        def parse_(  # type: ignore[override]
            self,
            response: DummyResponse,
            browser_response: BrowserResponse,
            product: Product,
        ):
            yield {
                "product": product,
                "browser_response": browser_response,
            }

    port = get_ephemeral_port()
    handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage)

    settings = create_scrapy_settings(None)
    settings.update(SETTINGS)
    settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/")
    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100}
    item, _, _ = await crawl_single_item(
        ItemDepSpider, HtmlResource, settings, port=port
    )

    # The mock server counts API requests; the provider must have batched
    # both dependencies into one call.
    count_url = fresh_mockserver.urljoin("/count").encode()
    count_response = await Agent(reactor).request(b"GET", count_url)
    body = await readBody(count_response)
    assert int(body.decode()) == 1
    assert "browser_response" in item
    assert "product" in item


# https://github.com/scrapy-plugins/scrapy-zyte-api/issues/91
@pytest.mark.xfail(reason="Not implemented yet", raises=AssertionError, strict=True)
@ensureDeferred
async def test_itemprovider_requests_indirect_dependencies(fresh_mockserver):
    """A Product needed directly plus one needed indirectly (through MyItem's
    page object) should ideally share one Zyte API call — known not to work
    yet, hence the strict xfail."""

    class ItemDepSpider(ZyteAPISpider):
        def parse_(self, response: DummyResponse, product: Product, my_item: MyItem):  # type: ignore[override]
            yield {
                "product": product,
                "my_item": my_item,
            }

    port = get_ephemeral_port()
    handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage)

    settings = create_scrapy_settings(None)
    settings.update(SETTINGS)
    settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/")
    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100}
    item, url, _ = await crawl_single_item(
        ItemDepSpider, HtmlResource, settings, port=port
    )
    count_resp = await Agent(reactor).request(
        b"GET", fresh_mockserver.urljoin("/count").encode()
    )
    call_count = int((await readBody(count_resp)).decode())
    # Fails today: the indirect dependency triggers a second API request.
    assert call_count == 1
    assert "my_item" in item
    assert "product" in item


@ensureDeferred
async def test_itemprovider_requests_indirect_dependencies_workaround(fresh_mockserver):
    """Declaring BrowserResponse on the callback alongside the indirect
    MyItem dependency keeps everything within a single Zyte API call."""

    class ItemDepSpider(ZyteAPISpider):
        def parse_(  # type: ignore[override]
            self,
            response: DummyResponse,
            product: Product,
            browser_response: BrowserResponse,
            my_item: MyItem,
        ):
            yield {
                "product": product,
                "my_item": my_item,
                "browser_response": browser_response,
            }

    port = get_ephemeral_port()
    handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage)

    settings = create_scrapy_settings(None)
    settings.update(SETTINGS)
    settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/")
    # NOTE(review): provider priority 1 here, vs 1100 in the other tests —
    # presumably part of the workaround; confirm against scrapy-poet docs.
    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1}
    item, _, _ = await crawl_single_item(
        ItemDepSpider, HtmlResource, settings, port=port
    )

    count_url = fresh_mockserver.urljoin("/count").encode()
    count_response = await Agent(reactor).request(b"GET", count_url)
    body = await readBody(count_response)
    assert int(body.decode()) == 1
    assert "my_item" in item
    assert "product" in item
    assert "browser_response" in item

0 comments on commit 3c82d07

Please sign in to comment.