
Google Search: Allow extracting items from SERP results #78

Merged · 12 commits · Nov 22, 2024
5 changes: 5 additions & 0 deletions docs/reference/index.rst
@@ -44,5 +44,10 @@ Parameter mixins

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
:exclude-members: model_computed_fields
2 changes: 1 addition & 1 deletion setup.py
@@ -18,7 +18,7 @@
"scrapy-poet>=0.24.0",
"scrapy-spider-metadata>=0.2.0",
"scrapy-zyte-api[provider]>=0.23.0",
"zyte-common-items>=0.23.0",
"zyte-common-items>=0.26.2",
],
classifiers=[
"Development Status :: 3 - Alpha",
109 changes: 106 additions & 3 deletions tests/test_serp.py
@@ -4,8 +4,13 @@
from scrapy_spider_metadata import get_spider_metadata
from scrapy_zyte_api.responses import ZyteAPITextResponse
from w3lib.url import add_or_replace_parameter
+from zyte_common_items import Product

-from zyte_spider_templates.spiders.serp import GoogleSearchSpider
+from zyte_spider_templates.spiders.serp import (
+    ITEM_TYPE_CLASSES,
+    GoogleSearchSpider,
+    SerpItemType,
+)

from . import get_crawler
from .utils import assertEqualSpiderMetadata
@@ -259,6 +264,25 @@ def test_metadata():
"title": "Max Pages",
"type": "integer",
},
"item_type": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"default": None,
"description": (
"If specified, follow organic search result links, "
"and extract the selected data type from the target "
"pages. Spider output items will be of the specified "
"data type, not search engine results page items."
),
"enum": [
"article",
"articleList",
"forumThread",
"jobPosting",
"product",
"productList",
],
"title": "Follow and Extract",
},
"max_requests": {
"anyOf": [{"type": "integer"}, {"type": "null"}],
"default": 100,
@@ -319,7 +343,9 @@ def test_search_queries():

def test_pagination():
crawler = get_crawler()
-spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
+spider = GoogleSearchSpider.from_crawler(
+    crawler, search_queries="foo bar", max_pages=3
+)

def run_parse_serp(total_results, page=1):
url = "https://www.google.com/search?q=foo+bar"
@@ -388,6 +414,14 @@ def run_parse_serp(total_results, page=1):
assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20"
assert requests[0].cb_kwargs["page_number"] == 3

# Do not go over max_pages
items, requests = run_parse_serp(
total_results=31,
page=3,
)
assert len(items) == 1
assert len(requests) == 0


def test_get_serp_request():
crawler = get_crawler()
@@ -404,7 +438,9 @@

def test_parse_serp():
crawler = get_crawler()
-spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
+spider = GoogleSearchSpider.from_crawler(
+    crawler, search_queries="foo bar", max_pages=43
+)
url = "https://www.google.com/search?q=foo+bar"
response = ZyteAPITextResponse.from_api_response(
api_response={
@@ -445,3 +481,70 @@ def test_parse_serp():
# The page_number parameter is required.
with pytest.raises(TypeError):
spider.parse_serp(response)


def test_item_type():
crawler = get_crawler()
spider = GoogleSearchSpider.from_crawler(
crawler, search_queries="foo bar", max_pages=43, item_type="product"
)
url = "https://www.google.com/search?q=foo+bar"
response = ZyteAPITextResponse.from_api_response(
api_response={
"serp": {
"organicResults": [
{
"description": "…",
"name": "…",
"url": f"https://example.com/{rank}",
"rank": rank,
}
for rank in range(1, 11)
],
"metadata": {
"dateDownloaded": "2024-10-25T08:59:45Z",
"displayedQuery": "foo bar",
"searchedQuery": "foo bar",
"totalOrganicResults": 99999,
},
"pageNumber": 1,
"url": url,
},
"url": url,
},
)
items = []
requests = []
for item_or_request in spider.parse_serp(response, page_number=42):
if isinstance(item_or_request, Request):
requests.append(item_or_request)
else:
items.append(item_or_request)
assert len(items) == 0
assert len(requests) == 11

assert requests[0].url == add_or_replace_parameter(url, "start", "420")
assert requests[0].cb_kwargs["page_number"] == 43

for rank in range(1, 11):
assert requests[rank].url == f"https://example.com/{rank}"
assert requests[rank].callback == spider.parse_result
assert requests[rank].meta == {
"crawling_logs": {"page_type": "product"},
"inject": [Product],
}


def test_item_type_mappings():
# Ensure that all SerpItemType keys and values match.
for entry in SerpItemType:
assert entry.name == entry.value

# Ensure that the ITEM_TYPE_CLASSES dict maps all values from the
# corresponding enum except for serp.
actual_keys = set(ITEM_TYPE_CLASSES)
expected_keys = set(entry.value for entry in SerpItemType)
assert actual_keys == expected_keys

# Also ensure that no dict value is repeated.
assert len(actual_keys) == len(set(ITEM_TYPE_CLASSES.values()))
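
For reference, a minimal usage sketch of the new item_type argument. This runner is hypothetical and not part of the PR's test suite; it assumes a project already configured for Zyte API (e.g. via scrapy-zyte-api and a ZYTE_API_KEY), which this spider requires:

from scrapy.crawler import CrawlerProcess

from zyte_spider_templates.spiders.serp import GoogleSearchSpider

# Zyte API credentials and the scrapy-zyte-api integration are assumed
# to be configured in the project settings.
process = CrawlerProcess()
process.crawl(
    GoogleSearchSpider,
    search_queries="foo bar",
    max_pages=2,
    item_type="product",  # follow organic results and yield Product items
)
process.start()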
2 changes: 1 addition & 1 deletion tox.ini
@@ -26,7 +26,7 @@ deps =
scrapy-poet==0.24.0
scrapy-spider-metadata==0.2.0
scrapy-zyte-api[provider]==0.23.0
-zyte-common-items==0.23.0
+zyte-common-items==0.26.2

[testenv:mypy]
deps =
100 changes: 93 additions & 7 deletions zyte_spider_templates/spiders/serp.py
@@ -1,12 +1,23 @@
from enum import Enum
from typing import Any, Dict, Iterable, List, Optional, Union

from pydantic import BaseModel, Field, field_validator
from scrapy import Request
from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from w3lib.url import add_or_replace_parameter
-from zyte_common_items import Serp
+from zyte_common_items import (
+    Article,
+    ArticleList,
+    ForumThread,
+    JobPosting,
+    Product,
+    ProductList,
+    Serp,
+)

from ..documentation import document_enum
from ..params import MaxRequestsParam
from ._google_domains import GoogleDomain
from .base import BaseSpider
@@ -48,6 +59,62 @@ class SerpMaxPagesParam(BaseModel):
)


@document_enum
class SerpItemType(str, Enum):
article: str = "article"
"""
Article data.
"""

articleList: str = "articleList"
"""
Article list data.
"""

forumThread: str = "forumThread"
"""
Forum thread data.
"""

jobPosting: str = "jobPosting"
"""
Job posting data.
"""

product: str = "product"
"""
Product data.
"""

productList: str = "productList"
"""
Product list data.
"""


ITEM_TYPE_CLASSES = {
SerpItemType.article: Article,
SerpItemType.articleList: ArticleList,
SerpItemType.forumThread: ForumThread,
SerpItemType.jobPosting: JobPosting,
SerpItemType.product: Product,
SerpItemType.productList: ProductList,
}


class SerpItemTypeParam(BaseModel):
item_type: Optional[SerpItemType] = Field(
title="Follow and Extract",
description=(
"If specified, follow organic search result links, and extract "
"the selected data type from the target pages. Spider output "
"items will be of the specified data type, not search engine "
"results page items."
),
default=None,
)


class GoogleDomainParam(BaseModel):
domain: GoogleDomain = Field(
title="Domain",
@@ -58,6 +125,7 @@ class GoogleDomainParam(BaseModel):

class GoogleSearchSpiderParams(
MaxRequestsParam,
SerpItemTypeParam,

Contributor: Should we provide an extract_from option, which applies only to the items extracted from the linked websites, not to the search results extraction itself?

Contributor: It doesn't need to be in the same PR, though. We should also add custom attributes extraction, probably also separately. (A hypothetical sketch of such an extract_from parameter follows this hunk.)

SerpMaxPagesParam,
SearchQueriesParam,
GoogleDomainParam,
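
A hypothetical sketch of the extract_from parameter suggested in the comment above; the SerpExtractFromParam name, enum values, and wording are illustrative only and not part of this PR. The idea is that it would apply only to the follow-up requests for organic results, not to SERP extraction itself:

from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field


class ExtractFrom(str, Enum):
    # Assumed values, mirroring Zyte API extraction sources.
    httpResponseBody = "httpResponseBody"
    browserHtml = "browserHtml"


class SerpExtractFromParam(BaseModel):
    extract_from: Optional[ExtractFrom] = Field(
        title="Extraction source",
        description=(
            "Extraction source for the pages followed from organic "
            "search results; does not affect search results extraction "
            "itself."
        ),
        default=None,
    )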
@@ -126,9 +194,27 @@ def start_requests(self) -> Iterable[Request]:
def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
    serp = Serp.from_dict(response.raw_api_response["serp"])

-    next_start = page_number * self._results_per_page
-    if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
-        next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
-        yield self.get_serp_request(next_url, page_number=page_number + 1)
+    if page_number < self.args.max_pages:
+        next_start = page_number * self._results_per_page
+        if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
+            next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
+            yield self.get_serp_request(next_url, page_number=page_number + 1)
+
+    if self.args.item_type is None:
+        yield serp
+        return
+
+    for result in serp.organicResults:
+        yield response.follow(
+            result.url,
+            callback=self.parse_result,
+            meta={
+                "crawling_logs": {"page_type": self.args.item_type.value},
+                "inject": [ITEM_TYPE_CLASSES[self.args.item_type]],
+            },
+        )
-    yield serp

+def parse_result(
+    self, response: DummyResponse, dynamic: DynamicDeps
+) -> Iterable[Any]:
+    yield next(iter(dynamic.values()))
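
For context on parse_result above: scrapy-poet's DynamicDeps is a dict subclass keyed by the classes listed in meta["inject"], which is why one generic callback can emit whatever item type was requested. A minimal sketch, assuming a Product was injected (hypothetical standalone usage, not part of the PR):

from scrapy_poet import DynamicDeps
from zyte_common_items import Product

# DynamicDeps maps each injected class to the instance scrapy-poet built.
deps = DynamicDeps({Product: Product(url="https://example.com/1")})

# parse_result yields the first (and here only) injected item.
item = next(iter(deps.values()))
assert isinstance(item, Product)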