Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Books #6

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions quotesbot/spiders/ScrapySpider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# import sys
# print(sys.version)
# 路徑:C:\user\Anaconda3\envs\scrapyTest\Lib\site-packages\scrapy\booksDemo
import sys
import io
import scrapy
from urllib import parse as urlparse
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8')

class booksSpider(scrapy.Spider):
name = "books"
allowed_domains = ["books.com.tw"]
start_urls = [
"http://activity.books.com.tw/everylettermatters/sentence/latest"
]

# def parse(self, response):
# filename = response.url.split("/")[-2]
# with open(filename, 'wb') as f:
# f.write(response.body)

def parse(self, response):
# learn_nodes = response.css('div.item')
learn_nodes = response.xpath('//div[@class="item"]')
for learn_node in learn_nodes:
yield{
'content:' : "".join(str(learn_node.css('h5 > a::text').extract_first()).split()) ,
'source:' : learn_node.css('p.source-book>span.link>a::text').extract_first(),
'from :' : str(learn_node.css('p.source-book>span.link:nth-child(2) >a::text').extract_first())
# '來源:' : learn_node.css('p.source-book>span.link>a::text').extract_first(),
# '作者:' : learn_node.css('p.source-book>span.link:nth-child(2) >a::text').extract_first()
}


# class booksSpider(scrapy.Spider):
# name = "quotes"
# allowed_domains = ["quotes.toscrape.com"]
# start_urls = [
# "http://quotes.toscrape.com/"
# ]
#
# # def parse(self, response):
# # filename = response.url.split("/")[-2]
# # with open(filename, 'wb') as f:
# # f.write(response.body)
#
# # def parse(self, response):
# # # learn_nodes = response.css('div.item')
# # learn_nodes = response.xpath('//div[@class="quote"]')
# # for learn_node in learn_nodes:
# # yield{
# # '英文名言' : learn_node.css('span.text::text').extract_first()
# # }
# def parse(self, response):
# #使用 css 選擇要素進行抓取,如果喜歡用BeautifulSoup之類的也可以
# #先定位一整塊的quote,在這個網頁塊下進行作者、名言,標籤的抓取
# for quote in response.css('.quote'):
# yield {
# '作者' : quote.css('small.author::text').extract_first(),
# 'tags' : quote.css('div.tags a.tag::text').extract(),
# '英文名言' : quote.css('span.text::text').extract_first()
# }