diff --git a/tests/test_job_posting.py b/tests/test_job_posting.py
index f5b759e..c972e0a 100644
--- a/tests/test_job_posting.py
+++ b/tests/test_job_posting.py
@@ -7,12 +7,7 @@
 from pydantic import ValidationError
 from scrapy_poet import DummyResponse
 from scrapy_spider_metadata import get_spider_metadata
-from zyte_common_items import (
-    JobPosting,
-    JobPostingNavigation,
-    ProbabilityRequest,
-    Request,
-)
+from zyte_common_items import JobPosting, JobPostingNavigation, ProbabilityRequest
 
 from zyte_spider_templates._geolocations import (
     GEOLOCATION_OPTIONS,
@@ -472,7 +467,7 @@ def test_get_nextpage_request():
     url = "https://example.com"
 
     # Minimal Args
-    request = Request(url)
+    request = ProbabilityRequest(url=url)
     spider = JobPostingSpider(url="https://example.com")
     parse_navigation = lambda _: None
     spider.parse_navigation = parse_navigation  # type: ignore
@@ -490,7 +485,7 @@ def test_get_parse_navigation_request():
     url = "https://example.com"
 
     # Minimal args
-    request = Request(url)
+    request = ProbabilityRequest(url=url)
     spider = JobPostingSpider(url="https://example.com")
     parse_navigation = lambda _: None
     spider.parse_navigation = parse_navigation  # type: ignore
diff --git a/zyte_spider_templates/spiders/job_posting.py b/zyte_spider_templates/spiders/job_posting.py
index 0163a29..14697a7 100644
--- a/zyte_spider_templates/spiders/job_posting.py
+++ b/zyte_spider_templates/spiders/job_posting.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, cast
 
 import requests
+import scrapy
 from pydantic import BaseModel, ConfigDict, Field
-from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy_poet import DummyResponse
 from scrapy_spider_metadata import Args
@@ -154,19 +154,19 @@ def get_start_request(self, url):
                 else "jobPostingNavigation"
             },
         }
-        return Request(
+        return scrapy.Request(
             url=url,
             callback=callback,
             meta=meta,
         )
 
-    def start_requests(self) -> Iterable[Request]:
+    def start_requests(self) -> Iterable[scrapy.Request]:
         for url in self.start_urls:
             yield self.get_start_request(url)
 
     def parse_navigation(
         self, response: DummyResponse, navigation: JobPostingNavigation
-    ) -> Iterable[Request]:
+    ) -> Iterable[scrapy.Request]:
         job_postings = navigation.items or []
         for request in job_postings:
             yield self.get_parse_job_posting_request(request)
@@ -178,7 +178,9 @@ def parse_navigation(
                     f"are no job posting links found in {navigation.url}"
                 )
             else:
-                yield self.get_nextpage_request(navigation.nextPage)
+                yield self.get_nextpage_request(
+                    cast(ProbabilityRequest, navigation.nextPage)
+                )
 
     def parse_job_posting(
         self, response: DummyResponse, job_posting: JobPosting
@@ -189,6 +191,7 @@ def parse_job_posting(
         if probability is None or probability >= 0.1:
             yield job_posting
         else:
+            assert self.crawler.stats
             self.crawler.stats.inc_value("drop_item/job_posting/low_probability")
             self.logger.info(
                 f"Ignoring item from {response.url} since its probability is "
@@ -197,11 +200,11 @@ def parse_job_posting(
 
     def get_parse_navigation_request(
         self,
-        request: Union[ProbabilityRequest, Request],
+        request: ProbabilityRequest,
         callback: Optional[Callable] = None,
         page_params: Optional[Dict[str, Any]] = None,
         page_type: str = "jobPostingNavigation",
-    ) -> Request:
+    ) -> scrapy.Request:
         callback = callback or self.parse_navigation
 
         return request.to_scrapy(
@@ -218,7 +221,7 @@ def get_parse_navigation_request(
 
     def get_nextpage_request(
         self,
-        request: Union[ProbabilityRequest, Request],
+        request: ProbabilityRequest,
         callback: Optional[Callable] = None,
         page_params: Optional[Dict[str, Any]] = None,
     ):
@@ -228,7 +231,7 @@ def get_nextpage_request(
 
     def get_parse_job_posting_request(
         self, request: ProbabilityRequest, callback: Optional[Callable] = None
-    ) -> Request:
+    ) -> scrapy.Request:
         callback = callback or self.parse_job_posting
 
         probability = request.get_probability()