Skip to content

Commit

Permalink
Google Search: add geolocation parameters (#80)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Nov 22, 2024
1 parent 317f69b commit b6b33ea
Show file tree
Hide file tree
Showing 12 changed files with 1,008 additions and 182 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ ignore_missing_imports = true

[tool.black]
target-version = ["py38", "py39", "py310", "py311", "py312"]
force-exclude = "template.py"
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ ignore =
# First line should not be the function's "signature"
D402

exclude =
template.py

per-file-ignores =
# F401: Ignore "imported but unused" errors in __init__ files, as those
# imports are there to expose submodule functions so they can be imported
Expand Down
141 changes: 2 additions & 139 deletions tests/test_ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pytest
import requests
import scrapy
from pydantic import ValidationError
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import get_spider_metadata
from web_poet.page_inputs.browser import BrowserResponse
Expand All @@ -21,43 +20,13 @@
GEOLOCATION_OPTIONS_WITH_CODE,
Geolocation,
)
from zyte_spider_templates.spiders.ecommerce import (
EcommerceCrawlStrategy,
EcommerceSpider,
)
from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

from . import get_crawler
from .test_utils import URL_TO_DOMAIN
from .utils import assertEqualSpiderMetadata


def test_parameters():
    """Check which argument combinations EcommerceSpider accepts or rejects.

    Each scenario is a pair of keyword arguments and a flag telling whether
    instantiating the spider with them must raise ``ValidationError``.
    Scenarios run in order, one instantiation per scenario.
    """
    url = "https://example.com"
    scenarios = (
        # ``url`` is required.
        ({}, True),
        ({"url": url}, False),
        # ``crawl_strategy`` takes an enum member or its string value, but
        # not an unknown string.
        ({"url": url, "crawl_strategy": EcommerceCrawlStrategy.automatic}, False),
        ({"url": url, "crawl_strategy": "automatic"}, False),
        ({"url": url, "crawl_strategy": "unknown"}, True),
        # Non-empty ``search_queries`` is rejected together with the
        # "direct_item" strategy, and accepted with "automatic".
        ({"url": url, "crawl_strategy": "direct_item", "search_queries": ""}, False),
        ({"url": url, "crawl_strategy": "automatic", "search_queries": "foo"}, False),
        ({"url": url, "crawl_strategy": "direct_item", "search_queries": "foo"}, True),
    )
    for spider_kwargs, must_fail in scenarios:
        if must_fail:
            with pytest.raises(ValidationError):
                EcommerceSpider(**spider_kwargs)
        else:
            EcommerceSpider(**spider_kwargs)


def test_start_requests():
url = "https://example.com"
crawler = get_crawler()
Expand Down Expand Up @@ -305,108 +274,6 @@ def test_parse_search_request_template_probability(probability, yields_items):
assert items if yields_items else not items


def test_arguments():
    """Check how spider arguments are reflected in the crawler settings.

    Every scenario builds a crawler (optionally pre-populated with a single
    setting), instantiates the spider from it with one extra argument, and
    compares the setting read back from the crawler with the expected value.
    """
    # The URL is a required argument, so every instantiation must include it.
    required = {"url": "https://example.com"}

    # Passing nothing beyond the required argument must work.
    EcommerceSpider.from_crawler(get_crawler(), **required)

    # The same three geolocation scenarios are exercised against both
    # settings: no previous value, an unrelated previous value that must be
    # kept, and a previous geolocation that must be overridden.
    geolocation_scenarios = [
        ("geolocation", "DE", setting_name, initial, "getdict", expected)
        for setting_name in ("ZYTE_API_AUTOMAP_PARAMS", "ZYTE_API_PROVIDER_PARAMS")
        for initial, expected in (
            (None, {"geolocation": "DE"}),
            ('{"browserHtml": true}', {"browserHtml": True, "geolocation": "DE"}),
            ('{"geolocation": "IE"}', {"geolocation": "DE"}),
        )
    ]
    extract_from_scenarios = [
        (
            "extract_from",
            "browserHtml",
            "ZYTE_API_PROVIDER_PARAMS",
            None,
            "getdict",
            {
                "productOptions": {"extractFrom": "browserHtml"},
                "productNavigationOptions": {"extractFrom": "browserHtml"},
            },
        ),
        (
            "extract_from",
            "httpResponseBody",
            "ZYTE_API_PROVIDER_PARAMS",
            {"geolocation": "US"},
            "getdict",
            {
                "productOptions": {"extractFrom": "httpResponseBody"},
                "productNavigationOptions": {"extractFrom": "httpResponseBody"},
                "geolocation": "US",
            },
        ),
        (
            "extract_from",
            None,
            "ZYTE_API_PROVIDER_PARAMS",
            {"geolocation": "US"},
            "getdict",
            {"geolocation": "US"},
        ),
    ]
    scenarios = [
        ("max_requests", "123", "ZYTE_API_MAX_REQUESTS", None, "getint", 123),
        *geolocation_scenarios,
        *extract_from_scenarios,
    ]

    for argument, value, setting_name, initial, getter_name, expected in scenarios:
        # Only pre-populate the setting when the scenario defines a value.
        initial_settings = {} if initial is None else {setting_name: initial}
        crawler = get_crawler(settings=initial_settings)
        spider = EcommerceSpider.from_crawler(
            crawler, **{argument: value}, **required
        )
        read_setting = getattr(crawler.settings, getter_name)
        assert read_setting(setting_name) == expected
        assert spider.allowed_domains == ["example.com"]  # type: ignore[attr-defined]


def test_metadata():
actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True)
expected_metadata = {
Expand Down Expand Up @@ -550,11 +417,7 @@ def test_metadata():
{"type": "null"},
],
"default": None,
"description": (
"ISO 3166-1 alpha-2 2-character string specified in "
"https://docs.zyte.com/zyte-api/usage/reference.html"
"#operation/extract/request/geolocation."
),
"description": "Country of the IP addresses to use.",
"enumMeta": {
code: {
"title": GEOLOCATION_OPTIONS_WITH_CODE[code],
Expand Down
Loading

0 comments on commit b6b33ea

Please sign in to comment.