scrapy.py
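"""CrawlSpider that walks cook.qld.gov.au and records, for each page,
its URL, <title> text, and the HTML of its portlet-content divs."""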
import scrapy
from urllib.parse import urljoin  # used by the optional PDF-link block below
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class CookspiderSpider(CrawlSpider):
    name = "Cookspider"
    allowed_domains = ["cook.qld.gov.au"]
    start_urls = [
        "http://www.cook.qld.gov.au/"
    ]

    # Follow every internal link on the site and pass each fetched
    # page to parse_item.
    rules = (
        Rule(LinkExtractor(allow=r'/', deny=(), unique=True),
             callback='parse_item', follow=True),
    )
    def parse_item(self, response):
        i = {}
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        i['url'] = response.url
        i['title'] = ''.join(response.xpath('/html/head/title/text()').extract()).strip()
        # The leading ".//" keeps the inner query scoped to the selected
        # portlet-body nodes; a bare "//" would search the whole document.
        sel = response.xpath("//div[@class='portlet-body']")
        i['text'] = ''.join(sel.xpath(".//div[@class='portlet-content']").extract()).strip()
        # Optional: collect absolute URLs of linked PDFs.
        # i['file_urls'] = []
        # for link in response.xpath('//a[@href]/@href').extract():
        #     if link.endswith('.pdf'):
        #         i['file_urls'].append(urljoin(response.url, link))
        return i
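

# A minimal sketch (not in the original file) of running the spider
# directly with Scrapy's CrawlerProcess; the "items.json" feed path is
# an illustrative assumption. The same crawl can also be started from
# a shell with: scrapy runspider scrapy.py
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # FEEDS (Scrapy >= 2.1) writes the scraped items to disk.
        "FEEDS": {"items.json": {"format": "json"}},
    })
    process.crawl(CookspiderSpider)
    process.start()  # blocks until the crawl finishes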