Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Google Search: hl support #81

Merged
merged 3 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions tests/test_serp.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
GOOGLE_GL_OPTIONS_WITH_CODE,
GoogleGl,
)
from zyte_spider_templates.spiders._google_hl import (
GOOGLE_HL_OPTIONS,
GOOGLE_HL_OPTIONS_WITH_CODE,
GoogleHl,
)
from zyte_spider_templates.spiders.serp import (
ITEM_TYPE_CLASSES,
GoogleSearchSpider,
Expand Down Expand Up @@ -393,6 +398,41 @@ def test_metadata():
sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__)
),
},
"hl": {
"anyOf": [
{"type": "string"},
{"type": "null"},
],
"default": None,
"description": (
"User interface language, which can affect search "
"results. See "
"https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.hl"
),
"enumMeta": {
code: {
"title": GOOGLE_HL_OPTIONS_WITH_CODE[code],
}
for code in sorted(GoogleHl)
},
"title": "User Language",
"enum": list(
sorted(GOOGLE_HL_OPTIONS, key=GOOGLE_HL_OPTIONS.__getitem__)
),
},
"lr": {
"anyOf": [
{"type": "string"},
{"type": "null"},
],
"default": None,
"description": (
"Restricts search results to documents written in the "
"specified languages. See "
"https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.lr"
),
"title": "Content Languages",
},
},
"required": ["search_queries"],
"title": "GoogleSearchSpiderParams",
Expand Down Expand Up @@ -576,6 +616,36 @@ def test_parse_serp():
spider.parse_serp(response) # type: ignore[call-arg]


def test_hl():
    """Check that ``hl`` ends up in the start URL and in the follow-up URL."""
    spider = GoogleSearchSpider.from_crawler(
        get_crawler(), search_queries="foo", hl="gl", max_pages=2
    )

    # A single query must yield a single start request carrying hl.
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    first_url = "https://www.google.com/search?q=foo&hl=gl"
    assert start_requests[0].url == first_url

    # The request returned by run_parse_serp must keep hl after ``start``.
    items, follow_ups = run_parse_serp(spider)
    assert len(items) == 1
    assert len(follow_ups) == 1
    second_url = "https://www.google.com/search?q=foo&start=10&hl=gl"
    assert follow_ups[0].url == second_url


def test_lr():
    """Check that ``lr`` ends up in the start URL and in the follow-up URL."""
    spider = GoogleSearchSpider.from_crawler(
        get_crawler(), search_queries="foo", lr="lang_ja", max_pages=2
    )

    # A single query must yield a single start request carrying lr.
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    first_url = "https://www.google.com/search?q=foo&lr=lang_ja"
    assert start_requests[0].url == first_url

    # The request returned by run_parse_serp must keep lr after ``start``.
    items, follow_ups = run_parse_serp(spider)
    assert len(items) == 1
    assert len(follow_ups) == 1
    second_url = "https://www.google.com/search?q=foo&start=10&lr=lang_ja"
    assert follow_ups[0].url == second_url


def test_cr():
crawler = get_crawler()
spider = GoogleSearchSpider.from_crawler(
Expand Down
3 changes: 3 additions & 0 deletions utils/google-hl-updater/requirements.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
jinja2
parsel
requests
32 changes: 32 additions & 0 deletions utils/google-hl-updater/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile
#
certifi==2024.8.30
# via requests
charset-normalizer==3.4.0
# via requests
cssselect==1.2.0
# via parsel
idna==3.10
# via requests
jinja2==3.1.4
# via -r requirements.in
jmespath==1.0.1
# via parsel
lxml==5.3.0
# via parsel
markupsafe==3.0.2
# via jinja2
packaging==24.2
# via parsel
parsel==1.9.1
# via -r requirements.in
requests==2.32.3
# via -r requirements.in
urllib3==2.2.3
# via requests
w3lib==2.2.1
# via parsel
18 changes: 18 additions & 0 deletions utils/google-hl-updater/template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{#- Jinja2 template for the generated _google_hl.py module.

    Expects a ``languages`` sequence whose items provide ``code``, ``name``
    and ``keyword``. Literal Python is wrapped in raw blocks so that its
    braces are not interpreted by Jinja2. Enum members are emitted without a
    type annotation, since type checkers reject annotated enum members.
    The whitespace control on this comment keeps the rendered output
    starting directly at the header comment below. -#}
{% raw %}# _google_gl.py counterpart for
# https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages
#
# Built automatically with ../../utils/google-hl-updater

from enum import Enum

GOOGLE_HL_OPTIONS = {{% endraw %}{% for language in languages %}
    "{{ language.code }}": "{{ language.name }}",{% endfor %}{% raw %}
}
GOOGLE_HL_OPTIONS_WITH_CODE = {
    code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items()
}


class GoogleHl(str, Enum):{% endraw %}{% for language in languages %}
    {{ language.keyword }} = "{{ language.code }}"{% endfor %}

38 changes: 38 additions & 0 deletions utils/google-hl-updater/update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from keyword import iskeyword
from pathlib import Path

import jinja2
import requests
from parsel import Selector

# Rows scraped from the reference table, in document order. Each entry holds
# the language "code", a Python-safe "keyword" derived from it, and the
# display "name"; these are the fields the template reads.
languages = []

# Use a timeout and check the HTTP status so that a stalled connection or an
# error page cannot silently produce an empty or bogus module.
response = requests.get(
    "https://developers.google.com/custom-search/docs/json_api_reference",
    timeout=60,
)
response.raise_for_status()
selector = Selector(text=response.text)
table = selector.xpath(
    '//*[@id="supported-interface-languages"]/following-sibling::table[1]'
)
for tr in table.css("tr"):
    name = tr.xpath("td/text()").get()
    if not name:  # header
        continue
    code = tr.xpath("td/span/text()").get()
    if not code:
        # Without this check the failure would be an AttributeError on
        # None.replace(), which does not say which row is malformed.
        raise ValueError(f"No language code found in the row for {name!r}")
    # Enum member names must be valid identifiers: hyphens become
    # underscores and Python keywords (e.g. "is") get a trailing underscore.
    keyword = code.replace("-", "_")
    if iskeyword(keyword):
        keyword = f"{keyword}_"
    languages.append({"code": code, "keyword": keyword, "name": name})

if not languages:
    # Rendering with no rows would emit a class without a body, i.e. a module
    # that cannot even be imported, so refuse to overwrite the existing file.
    raise RuntimeError(
        "No languages found; the layout of the reference page may have changed"
    )

script_dir = Path(__file__).parent
template_environment = jinja2.Environment()
# Read and write as UTF-8 explicitly so the result does not depend on the
# locale of the machine running the updater.
template = template_environment.from_string(
    (script_dir / "template.py").read_text(encoding="utf-8")
)
output = template.render(languages=languages)
output_path = (
    script_dir.parent.parent
    / "zyte_spider_templates"
    / "spiders"
    / "_google_hl.py"
)
output_path.write_text(output, encoding="utf-8")
181 changes: 181 additions & 0 deletions zyte_spider_templates/spiders/_google_hl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# _google_gl.py counterpart for
# https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages
#
# Built automatically with ../../utils/google-hl-updater

from enum import Enum

# Interface-language codes mapped to their display names, listed in
# alphabetical order of the display name.
#
# NOTE: the names for "ml" (Malayalam) and "su" (Sundanese) are spelled
# correctly here by hand. This module is marked as built automatically, so
# regenerating it may overwrite these corrections.
GOOGLE_HL_OPTIONS = {
    "af": "Afrikaans",
    "sq": "Albanian",
    # NOTE(review): ISO 639-1 assigns "sm" to Samoan and "am" to Amharic.
    # This pair presumably mirrors the upstream table; confirm which code
    # Google actually expects before changing it.
    "sm": "Amharic",
    "ar": "Arabic",
    "az": "Azerbaijani",
    "eu": "Basque",
    "be": "Belarusian",
    "bn": "Bengali",
    "bh": "Bihari",
    "bs": "Bosnian",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "zh-CN": "Chinese (Simplified)",
    "zh-TW": "Chinese (Traditional)",
    "hr": "Croatian",
    "cs": "Czech",
    "da": "Danish",
    "nl": "Dutch",
    "en": "English",
    "eo": "Esperanto",
    "et": "Estonian",
    "fo": "Faroese",
    "fi": "Finnish",
    "fr": "French",
    "fy": "Frisian",
    "gl": "Galician",
    "ka": "Georgian",
    "de": "German",
    "el": "Greek",
    "gu": "Gujarati",
    "iw": "Hebrew",
    "hi": "Hindi",
    "hu": "Hungarian",
    "is": "Icelandic",
    "id": "Indonesian",
    "ia": "Interlingua",
    "ga": "Irish",
    "it": "Italian",
    "ja": "Japanese",
    "jw": "Javanese",
    "kn": "Kannada",
    "ko": "Korean",
    "la": "Latin",
    "lv": "Latvian",
    "lt": "Lithuanian",
    "mk": "Macedonian",
    "ms": "Malay",
    "ml": "Malayalam",
    "mt": "Maltese",
    "mr": "Marathi",
    "ne": "Nepali",
    "no": "Norwegian",
    "nn": "Norwegian (Nynorsk)",
    "oc": "Occitan",
    "fa": "Persian",
    "pl": "Polish",
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",
    "pa": "Punjabi",
    "ro": "Romanian",
    "ru": "Russian",
    "gd": "Scots Gaelic",
    "sr": "Serbian",
    "si": "Sinhalese",
    "sk": "Slovak",
    "sl": "Slovenian",
    "es": "Spanish",
    "su": "Sundanese",
    "sw": "Swahili",
    "sv": "Swedish",
    "tl": "Tagalog",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "ti": "Tigrinya",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "vi": "Vietnamese",
    "cy": "Welsh",
    "xh": "Xhosa",
    "zu": "Zulu",
}
# Same keys, with the code appended to the name, e.g. "Galician (gl)".
GOOGLE_HL_OPTIONS_WITH_CODE = {
    code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items()
}


class GoogleHl(str, Enum):
    """Interface-language codes as a string-valued enumeration.

    Each member's value is the language code itself. Member names equal the
    code, except that hyphens are replaced with underscores (``zh_CN`` for
    ``"zh-CN"``) and the Python keyword ``is`` gets a trailing underscore
    (``is_``).

    Members are deliberately left unannotated: type checkers treat an
    explicit annotation on an enum member as an error.
    """

    af = "af"
    sq = "sq"
    sm = "sm"
    ar = "ar"
    az = "az"
    eu = "eu"
    be = "be"
    bn = "bn"
    bh = "bh"
    bs = "bs"
    bg = "bg"
    ca = "ca"
    zh_CN = "zh-CN"
    zh_TW = "zh-TW"
    hr = "hr"
    cs = "cs"
    da = "da"
    nl = "nl"
    en = "en"
    eo = "eo"
    et = "et"
    fo = "fo"
    fi = "fi"
    fr = "fr"
    fy = "fy"
    gl = "gl"
    ka = "ka"
    de = "de"
    el = "el"
    gu = "gu"
    iw = "iw"
    hi = "hi"
    hu = "hu"
    is_ = "is"
    id = "id"
    ia = "ia"
    ga = "ga"
    it = "it"
    ja = "ja"
    jw = "jw"
    kn = "kn"
    ko = "ko"
    la = "la"
    lv = "lv"
    lt = "lt"
    mk = "mk"
    ms = "ms"
    ml = "ml"
    mt = "mt"
    mr = "mr"
    ne = "ne"
    no = "no"
    nn = "nn"
    oc = "oc"
    fa = "fa"
    pl = "pl"
    pt_BR = "pt-BR"
    pt_PT = "pt-PT"
    pa = "pa"
    ro = "ro"
    ru = "ru"
    gd = "gd"
    sr = "sr"
    si = "si"
    sk = "sk"
    sl = "sl"
    es = "es"
    su = "su"
    sw = "sw"
    sv = "sv"
    tl = "tl"
    ta = "ta"
    te = "te"
    th = "th"
    ti = "ti"
    tr = "tr"
    uk = "uk"
    ur = "ur"
    uz = "uz"
    vi = "vi"
    cy = "cy"
    xh = "xh"
    zu = "zu"
Loading