Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cache provider output per request #94

Merged
merged 1 commit into from
Jun 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 32 additions & 11 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Any, Callable, List, Sequence, Set
from typing import Any, Callable, Dict, List, Sequence, Set, Type
from weakref import WeakKeyDictionary

from scrapy import Request
from scrapy.crawler import Crawler
Expand All @@ -21,11 +22,29 @@ class ZyteApiProvider(PageObjectInputProvider):

provided_classes = {BrowserResponse, BrowserHtml, Product}

def __init__(self, injector):
    """Create the provider and an empty per-request cache of provided instances."""
    super().__init__(injector)
    # Maps a scrapy Request to {provided class: instance}. A WeakKeyDictionary
    # is used so cache entries disappear once the request is garbage-collected.
    self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()
Copy link
Member

@kmike kmike Jun 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a similar solution in ItemProvider. I wonder if it's possible to move this cache to Injector instead, so that there is a single cache which is shared across all providers.

The first step could be using the same cached_instances storage, and keeping all cache handling code as-is. But maybe it'd be possible to improve on this further, e.g. don't pass a class to to_provide if an instance is in the cache, skip a provider if all to_provide classes are in the cache, and also automatically add results to the cache.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had this in my to-do, but I wonder if we need it now that ItemProvider is gone. I am not sure in which scenario a provider would get the same Request + deps now, given duplicate request filtering.


def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None:
    """Merge *mapping* (dependency class -> instance) into the cache for *request*.

    The per-request dict is created on first use; subsequent calls for the
    same request add or overwrite only the classes present in *mapping*.
    """
    # setdefault replaces the manual "check membership, then create" dance
    # with a single lookup.
    self._cached_instances.setdefault(request, {}).update(mapping)

async def __call__(
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
"""Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
# TODO what if ``response`` is already from Zyte API and contains something we need
results: List[Any] = []

for cls in list(to_provide):
item = self._cached_instances.get(request, {}).get(cls)
if item:
results.append(item)
to_provide.remove(cls)
if not to_provide:
return results

html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
item_keywords = {Product: "product"}
Expand All @@ -36,7 +55,7 @@ async def __call__(
for item_type, kw in item_keywords.items():
if item_type in to_provide:
zyte_api_meta[kw] = True
request = Request(
api_request = Request(
url=request.url,
meta={
"zyte_api": zyte_api_meta,
Expand All @@ -45,26 +64,28 @@ async def __call__(
callback=NO_CALLBACK,
)
api_response: ZyteAPITextResponse = await maybe_deferred_to_future(
crawler.engine.download(request)
crawler.engine.download(api_request)
)

assert api_response.raw_api_response
results: List[Any] = []
if html_requested:
html = BrowserHtml(api_response.raw_api_response["browserHtml"])
else:
html = None
if BrowserHtml in to_provide:
results.append(html)
self.update_cache(request, {BrowserHtml: html})
if BrowserResponse in to_provide:
results.append(
BrowserResponse(
url=api_response.url,
status=api_response.status,
html=html,
)
response = BrowserResponse(
url=api_response.url,
status=api_response.status,
html=html,
)
results.append(response)
self.update_cache(request, {BrowserResponse: response})
for item_type, kw in item_keywords.items():
if item_type in to_provide:
results.append(item_type.from_dict(api_response.raw_api_response[kw]))
item = item_type.from_dict(api_response.raw_api_response[kw])
results.append(item)
self.update_cache(request, {item_type: item})
return results
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,11 @@ def mockserver():

with MockServer() as server:
yield server


@pytest.fixture(scope="function")
def fresh_mockserver():
    """Yield a brand-new MockServer per test, so its request count starts at zero."""
    from .mockserver import MockServer

    with MockServer() as srv:
        yield srv
82 changes: 72 additions & 10 deletions tests/test_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,32 +73,94 @@ def url(self) -> str:
return str(self.response.url)


class ItemDepSpider(ZyteAPISpider):
def parse_(self, response: DummyResponse, product: Product, my_item: MyItem): # type: ignore[override]
yield {
"product": product,
"my_item": my_item,
}
@ensureDeferred
async def test_itemprovider_requests_direct_dependencies(fresh_mockserver):
    """Direct deps (browser response + item) should need a single Zyte API call."""

    class ItemDepSpider(ZyteAPISpider):
        def parse_(  # type: ignore[override]
            self,
            response: DummyResponse,
            browser_response: BrowserResponse,
            product: Product,
        ):
            yield {
                "product": product,
                "browser_response": browser_response,
            }

    port = get_ephemeral_port()
    handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage)

    settings = create_scrapy_settings(None)
    settings.update(SETTINGS)
    settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/")
    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100}
    item, url, _ = await crawl_single_item(
        ItemDepSpider, HtmlResource, settings, port=port
    )

    # Ask the mock server how many Zyte API requests it actually received.
    agent = Agent(reactor)
    count_resp = await agent.request(
        b"GET", fresh_mockserver.urljoin("/count").encode()
    )
    call_count = int((await readBody(count_resp)).decode())

    assert call_count == 1
    assert "browser_response" in item
    assert "product" in item


# https://github.com/scrapy-plugins/scrapy-zyte-api/issues/91
@pytest.mark.xfail(reason="Not implemented yet", raises=AssertionError, strict=True)
@ensureDeferred
async def test_itemprovider_requests_indirect_dependencies(fresh_mockserver):
    """An item needed only via a page object (indirect dep) should still be
    served from the same single Zyte API request — currently unimplemented,
    hence the strict xfail."""

    class ItemDepSpider(ZyteAPISpider):
        def parse_(self, response: DummyResponse, product: Product, my_item: MyItem):  # type: ignore[override]
            yield {
                "product": product,
                "my_item": my_item,
            }

    port = get_ephemeral_port()
    handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage)

    settings = create_scrapy_settings(None)
    settings.update(SETTINGS)
    settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/")
    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1100}
    item, url, _ = await crawl_single_item(
        ItemDepSpider, HtmlResource, settings, port=port
    )
    count_resp = await Agent(reactor).request(
        b"GET", fresh_mockserver.urljoin("/count").encode()
    )
    call_count = int((await readBody(count_resp)).decode())
    assert call_count == 1
    assert "my_item" in item
    assert "product" in item


@ensureDeferred
async def test_itemprovider_requests_indirect_dependencies_workaround(fresh_mockserver):
    """Declaring the indirect deps on the spider callback as well works around
    issue #91 and keeps everything in one Zyte API request."""

    class ItemDepSpider(ZyteAPISpider):
        def parse_(  # type: ignore[override]
            self,
            response: DummyResponse,
            product: Product,
            browser_response: BrowserResponse,
            my_item: MyItem,
        ):
            yield {
                "product": product,
                "my_item": my_item,
                "browser_response": browser_response,
            }

    port = get_ephemeral_port()
    handle_urls(f"{fresh_mockserver.host}:{port}")(MyPage)

    settings = create_scrapy_settings(None)
    settings.update(SETTINGS)
    settings["ZYTE_API_URL"] = fresh_mockserver.urljoin("/")
    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 1}
    item, url, _ = await crawl_single_item(
        ItemDepSpider, HtmlResource, settings, port=port
    )

    # Query the mock server's request counter.
    agent = Agent(reactor)
    count_resp = await agent.request(
        b"GET", fresh_mockserver.urljoin("/count").encode()
    )
    call_count = int((await readBody(count_resp)).decode())

    assert call_count == 1
    assert "my_item" in item
    assert "product" in item
    assert "browser_response" in item