Skip to content

Commit

Permalink
Google Search: add language support (#81)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Nov 22, 2024
1 parent b6b33ea commit 8e0eefb
Show file tree
Hide file tree
Showing 7 changed files with 390 additions and 6 deletions.
70 changes: 70 additions & 0 deletions tests/test_serp.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
GOOGLE_GL_OPTIONS_WITH_CODE,
GoogleGl,
)
from zyte_spider_templates.spiders._google_hl import (
GOOGLE_HL_OPTIONS,
GOOGLE_HL_OPTIONS_WITH_CODE,
GoogleHl,
)
from zyte_spider_templates.spiders.serp import (
ITEM_TYPE_CLASSES,
GoogleSearchSpider,
Expand Down Expand Up @@ -393,6 +398,41 @@ def test_metadata():
sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__)
),
},
"hl": {
"anyOf": [
{"type": "string"},
{"type": "null"},
],
"default": None,
"description": (
"User interface language, which can affect search "
"results. See "
"https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.hl"
),
"enumMeta": {
code: {
"title": GOOGLE_HL_OPTIONS_WITH_CODE[code],
}
for code in sorted(GoogleHl)
},
"title": "User Language",
"enum": list(
sorted(GOOGLE_HL_OPTIONS, key=GOOGLE_HL_OPTIONS.__getitem__)
),
},
"lr": {
"anyOf": [
{"type": "string"},
{"type": "null"},
],
"default": None,
"description": (
"Restricts search results to documents written in the "
"specified languages. See "
"https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.lr"
),
"title": "Content Languages",
},
},
"required": ["search_queries"],
"title": "GoogleSearchSpiderParams",
Expand Down Expand Up @@ -576,6 +616,36 @@ def test_parse_serp():
spider.parse_serp(response) # type: ignore[call-arg]


def test_hl():
    """Check that ``hl`` is sent on the start request and kept when paginating."""
    spider = GoogleSearchSpider.from_crawler(
        get_crawler(), search_queries="foo", hl="gl", max_pages=2
    )

    # A single search query must produce a single start request whose URL
    # carries the requested interface language.
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].url == "https://www.google.com/search?q=foo&hl=gl"

    # With max_pages=2, parsing the first results page must produce one item
    # and one follow-up request that still carries the hl parameter.
    serp_items, next_requests = run_parse_serp(spider)
    assert len(serp_items) == 1
    assert len(next_requests) == 1
    assert (
        next_requests[0].url == "https://www.google.com/search?q=foo&start=10&hl=gl"
    )


def test_lr():
    """Check that ``lr`` is sent on the start request and kept when paginating."""
    spider = GoogleSearchSpider.from_crawler(
        get_crawler(), search_queries="foo", lr="lang_ja", max_pages=2
    )

    # A single search query must produce a single start request whose URL
    # carries the requested content-language restriction.
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].url == "https://www.google.com/search?q=foo&lr=lang_ja"

    # With max_pages=2, parsing the first results page must produce one item
    # and one follow-up request that still carries the lr parameter.
    serp_items, next_requests = run_parse_serp(spider)
    assert len(serp_items) == 1
    assert len(next_requests) == 1
    assert (
        next_requests[0].url
        == "https://www.google.com/search?q=foo&start=10&lr=lang_ja"
    )


def test_cr():
crawler = get_crawler()
spider = GoogleSearchSpider.from_crawler(
Expand Down
3 changes: 3 additions & 0 deletions utils/google-hl-updater/requirements.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
jinja2
parsel
requests
32 changes: 32 additions & 0 deletions utils/google-hl-updater/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile
#
certifi==2024.8.30
# via requests
charset-normalizer==3.4.0
# via requests
cssselect==1.2.0
# via parsel
idna==3.10
# via requests
jinja2==3.1.4
# via -r requirements.in
jmespath==1.0.1
# via parsel
lxml==5.3.0
# via parsel
markupsafe==3.0.2
# via jinja2
packaging==24.2
# via parsel
parsel==1.9.1
# via -r requirements.in
requests==2.32.3
# via -r requirements.in
urllib3==2.2.3
# via requests
w3lib==2.2.1
# via parsel
18 changes: 18 additions & 0 deletions utils/google-hl-updater/template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{% raw %}# _google_gl.py counterpart for
# https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages
#
# Built automatically with ../../utils/google-hl-updater

from enum import Enum

GOOGLE_HL_OPTIONS = {{% endraw %}{% for language in languages %}
"{{ language.code }}": "{{ language.name }}",{% endfor %}{% raw %}
}
GOOGLE_HL_OPTIONS_WITH_CODE = {
code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items()
}


class GoogleHl(str, Enum):{% endraw %}{% for language in languages %}
{{ language.keyword }}: str = "{{ language.code }}"{% endfor %}

38 changes: 38 additions & 0 deletions utils/google-hl-updater/update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from keyword import iskeyword
from pathlib import Path

import jinja2
import requests
from parsel import Selector

# Regenerate zyte_spider_templates/spiders/_google_hl.py: scrape the table of
# supported interface languages from Google's JSON API reference and render
# it through the template.py file that sits next to this script.

languages = []

# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being parsed as if
# it were the reference document.
response = requests.get(
    "https://developers.google.com/custom-search/docs/json_api_reference",
    timeout=60,
)
response.raise_for_status()
selector = Selector(text=response.text)
table = selector.xpath(
    '//*[@id="supported-interface-languages"]/following-sibling::table[1]'
)
for tr in table.css("tr"):
    name = tr.xpath("td/text()").get()
    if not name:  # header
        continue
    code = tr.xpath("td/span/text()").get()
    if not code:
        # Without this check the code below fails with an opaque
        # AttributeError on None; name the offending row instead.
        raise ValueError(f"No language code found for table row {name!r}")
    # Enum member names must be valid identifiers: suffix Python keywords
    # (e.g. "is") with an underscore and replace hyphens (e.g. "zh-CN").
    keyword = f"{code}_" if iskeyword(code) else code
    keyword = keyword.replace("-", "_")
    languages.append({"code": code, "keyword": keyword, "name": name})

if not languages:
    # Rendering an empty list would overwrite the generated module with an
    # enum class that has no body, so stop before touching the output file.
    raise RuntimeError(
        "No interface languages found; the layout of the reference page may "
        "have changed"
    )

template_path = Path(__file__).parent / "template.py"
template_environment = jinja2.Environment()
# Explicit encoding so that the result does not depend on the locale of the
# machine running the script.
template = template_environment.from_string(
    template_path.read_text(encoding="utf-8")
)
output = template.render(languages=languages)
output_path = (
    Path(__file__).parent.parent.parent
    / "zyte_spider_templates"
    / "spiders"
    / "_google_hl.py"
)
output_path.write_text(output, encoding="utf-8")
181 changes: 181 additions & 0 deletions zyte_spider_templates/spiders/_google_hl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# _google_gl.py counterpart for
# https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages
#
# Built automatically with ../../utils/google-hl-updater

from enum import Enum

# Maps each interface-language code to its human-readable language name.
#
# NOTE(review): this module is marked as built automatically by
# utils/google-hl-updater, so the spelling corrections below ("Malayalam",
# "Sundanese") presumably need to be carried by the updater as well, or the
# next regeneration will bring back the upstream spellings.
GOOGLE_HL_OPTIONS = {
    "af": "Afrikaans",
    "sq": "Albanian",
    # NOTE(review): "sm" is the ISO 639-1 code for Samoan (Amharic is "am").
    # The pairing is kept as is because the key must stay aligned with the
    # enum members built from the same list; confirm which language Google
    # actually serves for hl=sm.
    "sm": "Amharic",
    "ar": "Arabic",
    "az": "Azerbaijani",
    "eu": "Basque",
    "be": "Belarusian",
    "bn": "Bengali",
    "bh": "Bihari",
    "bs": "Bosnian",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "zh-CN": "Chinese (Simplified)",
    "zh-TW": "Chinese (Traditional)",
    "hr": "Croatian",
    "cs": "Czech",
    "da": "Danish",
    "nl": "Dutch",
    "en": "English",
    "eo": "Esperanto",
    "et": "Estonian",
    "fo": "Faroese",
    "fi": "Finnish",
    "fr": "French",
    "fy": "Frisian",
    "gl": "Galician",
    "ka": "Georgian",
    "de": "German",
    "el": "Greek",
    "gu": "Gujarati",
    "iw": "Hebrew",
    "hi": "Hindi",
    "hu": "Hungarian",
    "is": "Icelandic",
    "id": "Indonesian",
    "ia": "Interlingua",
    "ga": "Irish",
    "it": "Italian",
    "ja": "Japanese",
    "jw": "Javanese",
    "kn": "Kannada",
    "ko": "Korean",
    "la": "Latin",
    "lv": "Latvian",
    "lt": "Lithuanian",
    "mk": "Macedonian",
    "ms": "Malay",
    "ml": "Malayalam",
    "mt": "Maltese",
    "mr": "Marathi",
    "ne": "Nepali",
    "no": "Norwegian",
    "nn": "Norwegian (Nynorsk)",
    "oc": "Occitan",
    "fa": "Persian",
    "pl": "Polish",
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",
    "pa": "Punjabi",
    "ro": "Romanian",
    "ru": "Russian",
    "gd": "Scots Gaelic",
    "sr": "Serbian",
    "si": "Sinhalese",
    "sk": "Slovak",
    "sl": "Slovenian",
    "es": "Spanish",
    "su": "Sundanese",
    "sw": "Swahili",
    "sv": "Swedish",
    "tl": "Tagalog",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "ti": "Tigrinya",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "vi": "Vietnamese",
    "cy": "Welsh",
    "xh": "Xhosa",
    "zu": "Zulu",
}
# Same keys, with the code appended to the name, e.g. "Galician (gl)".
GOOGLE_HL_OPTIONS_WITH_CODE = {
    code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items()
}


class GoogleHl(str, Enum):
    """Interface-language codes, one member per key of ``GOOGLE_HL_OPTIONS``.

    Each member's value is the language code itself. Member names equal the
    code except where the code is not usable as a Python identifier.
    """

    af: str = "af"
    sq: str = "sq"
    sm: str = "sm"
    ar: str = "ar"
    az: str = "az"
    eu: str = "eu"
    be: str = "be"
    bn: str = "bn"
    bh: str = "bh"
    bs: str = "bs"
    bg: str = "bg"
    ca: str = "ca"
    # Hyphens are not allowed in identifiers, so the member name uses an
    # underscore while the value keeps the hyphenated code.
    zh_CN: str = "zh-CN"
    zh_TW: str = "zh-TW"
    hr: str = "hr"
    cs: str = "cs"
    da: str = "da"
    nl: str = "nl"
    en: str = "en"
    eo: str = "eo"
    et: str = "et"
    fo: str = "fo"
    fi: str = "fi"
    fr: str = "fr"
    fy: str = "fy"
    gl: str = "gl"
    ka: str = "ka"
    de: str = "de"
    el: str = "el"
    gu: str = "gu"
    iw: str = "iw"
    hi: str = "hi"
    hu: str = "hu"
    # "is" is a Python keyword, so the member name carries a trailing
    # underscore; the value is still "is".
    is_: str = "is"
    id: str = "id"
    ia: str = "ia"
    ga: str = "ga"
    it: str = "it"
    ja: str = "ja"
    jw: str = "jw"
    kn: str = "kn"
    ko: str = "ko"
    la: str = "la"
    lv: str = "lv"
    lt: str = "lt"
    mk: str = "mk"
    ms: str = "ms"
    ml: str = "ml"
    mt: str = "mt"
    mr: str = "mr"
    ne: str = "ne"
    no: str = "no"
    nn: str = "nn"
    oc: str = "oc"
    fa: str = "fa"
    pl: str = "pl"
    pt_BR: str = "pt-BR"
    pt_PT: str = "pt-PT"
    pa: str = "pa"
    ro: str = "ro"
    ru: str = "ru"
    gd: str = "gd"
    sr: str = "sr"
    si: str = "si"
    sk: str = "sk"
    sl: str = "sl"
    es: str = "es"
    su: str = "su"
    sw: str = "sw"
    sv: str = "sv"
    tl: str = "tl"
    ta: str = "ta"
    te: str = "te"
    th: str = "th"
    ti: str = "ti"
    tr: str = "tr"
    uk: str = "uk"
    ur: str = "ur"
    uz: str = "uz"
    vi: str = "vi"
    cy: str = "cy"
    xh: str = "xh"
    zu: str = "zu"
Loading

0 comments on commit 8e0eefb

Please sign in to comment.