diff --git a/tests/test_serp.py b/tests/test_serp.py index cd1a549..4de4e29 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -17,6 +17,11 @@ GOOGLE_GL_OPTIONS_WITH_CODE, GoogleGl, ) +from zyte_spider_templates.spiders._google_hl import ( + GOOGLE_HL_OPTIONS, + GOOGLE_HL_OPTIONS_WITH_CODE, + GoogleHl, +) from zyte_spider_templates.spiders.serp import ( ITEM_TYPE_CLASSES, GoogleSearchSpider, @@ -393,6 +398,41 @@ def test_metadata(): sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__) ), }, + "hl": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "description": ( + "User interface language, which can affect search " + "results. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.hl" + ), + "enumMeta": { + code: { + "title": GOOGLE_HL_OPTIONS_WITH_CODE[code], + } + for code in sorted(GoogleHl) + }, + "title": "User Language", + "enum": list( + sorted(GOOGLE_HL_OPTIONS, key=GOOGLE_HL_OPTIONS.__getitem__) + ), + }, + "lr": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "description": ( + "Restricts search results to documents written in the " + "specified languages. 
def test_hl():
    """``hl`` is added to the first search URL and kept on the next page."""
    spider = GoogleSearchSpider.from_crawler(
        get_crawler(), search_queries="foo", hl="gl", max_pages=2
    )

    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    first_request = start_requests[0]
    assert first_request.url == "https://www.google.com/search?q=foo&hl=gl"

    # Parsing the first page must yield one item and one follow-up request
    # that still carries the ``hl`` parameter.
    items, next_requests = run_parse_serp(spider)
    assert len(items) == 1
    assert len(next_requests) == 1
    next_request = next_requests[0]
    assert next_request.url == "https://www.google.com/search?q=foo&start=10&hl=gl"


def test_lr():
    """``lr`` is added to the first search URL and kept on the next page."""
    spider = GoogleSearchSpider.from_crawler(
        get_crawler(), search_queries="foo", lr="lang_ja", max_pages=2
    )

    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    first_request = start_requests[0]
    assert first_request.url == "https://www.google.com/search?q=foo&lr=lang_ja"

    # Parsing the first page must yield one item and one follow-up request
    # that still carries the ``lr`` parameter.
    items, next_requests = run_parse_serp(spider)
    assert len(items) == 1
    assert len(next_requests) == 1
    next_request = next_requests[0]
    assert (
        next_request.url
        == "https://www.google.com/search?q=foo&start=10&lr=lang_ja"
    )
# Documentation page to scrape; the language table is the first table that
# follows the element with id "supported-interface-languages".
DOCS_URL = "https://developers.google.com/custom-search/docs/json_api_reference"
# Seconds to wait for the page. requests has no default timeout, so without
# this the script could block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def enum_member_name(code):
    """Return an identifier usable as the ``GoogleHl`` member name for *code*.

    Reserved words get a trailing underscore ("is" -> "is_") and hyphens are
    replaced with underscores ("zh-CN" -> "zh_CN").
    """
    name = f"{code}_" if iskeyword(code) else code
    return name.replace("-", "_")


def parse_languages(html):
    """Extract the language rows from the documentation page *html*.

    Returns a list of ``{"code", "keyword", "name"}`` dicts in table order.

    Raises:
        ValueError: if a row has a language name but no language code.
    """
    selector = Selector(text=html)
    table = selector.xpath(
        '//*[@id="supported-interface-languages"]/following-sibling::table[1]'
    )
    languages = []
    for tr in table.css("tr"):
        name = tr.xpath("td/text()").get()
        if not name:  # header
            continue
        code = tr.xpath("td/span/text()").get()
        if not code:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No language code found for {name!r}")
        languages.append(
            {"code": code, "keyword": enum_member_name(code), "name": name}
        )
    return languages


def main():
    """Download the language table and regenerate ``_google_hl.py``.

    Raises:
        requests.HTTPError: if the documentation page cannot be fetched.
        RuntimeError: if no language could be parsed from the page.
    """
    response = requests.get(DOCS_URL, timeout=REQUEST_TIMEOUT)
    # An error page would otherwise parse to zero rows and silently replace
    # the generated module with an empty one.
    response.raise_for_status()

    languages = parse_languages(response.text)
    if not languages:
        raise RuntimeError(
            f"No languages found at {DOCS_URL}; refusing to overwrite "
            "_google_hl.py"
        )

    script_dir = Path(__file__).parent
    template = jinja2.Environment().from_string(
        (script_dir / "template.py").read_text(encoding="utf-8")
    )
    output_path = (
        script_dir.parent.parent
        / "zyte_spider_templates"
        / "spiders"
        / "_google_hl.py"
    )
    output_path.write_text(template.render(languages=languages), encoding="utf-8")


if __name__ == "__main__":
    main()
# _google_gl.py counterpart for
# https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages
#
# Built automatically with ../../utils/google-hl-updater
#
# NOTE(review): this file is generated, so manual edits are presumably lost
# the next time the updater runs — confirm before fixing entries by hand.

from enum import Enum

# Maps each ``hl`` (interface language) code to its human-readable name.
GOOGLE_HL_OPTIONS = {
    "af": "Afrikaans",
    "sq": "Albanian",
    # NOTE(review): "sm" is the ISO 639-1 code for Samoan; Amharic is "am".
    # Presumably copied as-is from the upstream table — confirm.
    "sm": "Amharic",
    "ar": "Arabic",
    "az": "Azerbaijani",
    "eu": "Basque",
    "be": "Belarusian",
    "bn": "Bengali",
    "bh": "Bihari",
    "bs": "Bosnian",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "zh-CN": "Chinese (Simplified)",
    "zh-TW": "Chinese (Traditional)",
    "hr": "Croatian",
    "cs": "Czech",
    "da": "Danish",
    "nl": "Dutch",
    "en": "English",
    "eo": "Esperanto",
    "et": "Estonian",
    "fo": "Faroese",
    "fi": "Finnish",
    "fr": "French",
    "fy": "Frisian",
    "gl": "Galician",
    "ka": "Georgian",
    "de": "German",
    "el": "Greek",
    "gu": "Gujarati",
    "iw": "Hebrew",
    "hi": "Hindi",
    "hu": "Hungarian",
    "is": "Icelandic",
    "id": "Indonesian",
    "ia": "Interlingua",
    "ga": "Irish",
    "it": "Italian",
    "ja": "Japanese",
    "jw": "Javanese",
    "kn": "Kannada",
    "ko": "Korean",
    "la": "Latin",
    "lv": "Latvian",
    "lt": "Lithuanian",
    "mk": "Macedonian",
    "ms": "Malay",
    # NOTE(review): looks like a misspelling of "Malayalam" — confirm upstream.
    "ml": "Malayam",
    "mt": "Maltese",
    "mr": "Marathi",
    "ne": "Nepali",
    "no": "Norwegian",
    "nn": "Norwegian (Nynorsk)",
    "oc": "Occitan",
    "fa": "Persian",
    "pl": "Polish",
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",
    "pa": "Punjabi",
    "ro": "Romanian",
    "ru": "Russian",
    "gd": "Scots Gaelic",
    "sr": "Serbian",
    "si": "Sinhalese",
    "sk": "Slovak",
    "sl": "Slovenian",
    "es": "Spanish",
    # NOTE(review): ISO 639-1 "su" is Sundanese, not "Sudanese" — confirm
    # upstream.
    "su": "Sudanese",
    "sw": "Swahili",
    "sv": "Swedish",
    "tl": "Tagalog",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "ti": "Tigrinya",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "vi": "Vietnamese",
    "cy": "Welsh",
    "xh": "Xhosa",
    "zu": "Zulu",
}
# Same keys as GOOGLE_HL_OPTIONS, with the code appended to the name,
# e.g. "gl" -> "Galician (gl)".
GOOGLE_HL_OPTIONS_WITH_CODE = {
    code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items()
}


# One member per key of GOOGLE_HL_OPTIONS; each value is the code itself.
# Member names are the codes made identifier-safe: hyphens become underscores
# ("zh-CN" -> zh_CN) and the reserved word "is" gets a trailing underscore
# (is_). Mixing in ``str`` makes every member compare equal to its code.
class GoogleHl(str, Enum):
    af: str = "af"
    sq: str = "sq"
    sm: str = "sm"
    ar: str = "ar"
    az: str = "az"
    eu: str = "eu"
    be: str = "be"
    bn: str = "bn"
    bh: str = "bh"
    bs: str = "bs"
    bg: str = "bg"
    ca: str = "ca"
    zh_CN: str = "zh-CN"
    zh_TW: str = "zh-TW"
    hr: str = "hr"
    cs: str = "cs"
    da: str = "da"
    nl: str = "nl"
    en: str = "en"
    eo: str = "eo"
    et: str = "et"
    fo: str = "fo"
    fi: str = "fi"
    fr: str = "fr"
    fy: str = "fy"
    gl: str = "gl"
    ka: str = "ka"
    de: str = "de"
    el: str = "el"
    gu: str = "gu"
    iw: str = "iw"
    hi: str = "hi"
    hu: str = "hu"
    is_: str = "is"
    id: str = "id"
    ia: str = "ia"
    ga: str = "ga"
    it: str = "it"
    ja: str = "ja"
    jw: str = "jw"
    kn: str = "kn"
    ko: str = "ko"
    la: str = "la"
    lv: str = "lv"
    lt: str = "lt"
    mk: str = "mk"
    ms: str = "ms"
    ml: str = "ml"
    mt: str = "mt"
    mr: str = "mr"
    ne: str = "ne"
    no: str = "no"
    nn: str = "nn"
    oc: str = "oc"
    fa: str = "fa"
    pl: str = "pl"
    pt_BR: str = "pt-BR"
    pt_PT: str = "pt-PT"
    pa: str = "pa"
    ro: str = "ro"
    ru: str = "ru"
    gd: str = "gd"
    sr: str = "sr"
    si: str = "si"
    sk: str = "sk"
    sl: str = "sl"
    es: str = "es"
    su: str = "su"
    sw: str = "sw"
    sv: str = "sv"
    tl: str = "tl"
    ta: str = "ta"
    te: str = "te"
    th: str = "th"
    ti: str = "ti"
    tr: str = "tr"
    uk: str = "uk"
    ur: str = "ur"
    uz: str = "uz"
    vi: str = "vi"
    cy: str = "cy"
    xh: str = "xh"
    zu: str = "zu"
# Optional ``hl`` spider argument, restricted to the GoogleHl codes.
# Documented with comments rather than a class docstring so that no extra
# text can end up in the generated JSON schema.
class GoogleHlParam(BaseModel):
    hl: Optional[GoogleHl] = Field(
        default=None,
        title="User Language",
        description=(
            "User interface language, which can affect search results. "
            "See https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.hl"
        ),
        json_schema_extra={
            # One entry per enum member, titled "<language name> (<code>)".
            "enumMeta": {
                language: {"title": GOOGLE_HL_OPTIONS_WITH_CODE[language]}
                for language in GoogleHl
            }
        },
    )
# Optional ``lr`` spider argument: a free-form string, not validated here.
# Documented with comments rather than a class docstring so that no extra
# text can end up in the generated JSON schema.
class GoogleLrParam(BaseModel):
    lr: Optional[str] = Field(
        default=None,
        title="Content Languages",
        description=(
            "Restricts search results to documents written in the "
            "specified languages. "
            "See https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.lr"
        ),
    )