From c3b2b015763fb68ad15962a996803ae601719a41 Mon Sep 17 00:00:00 2001 From: Doomfires Date: Wed, 9 Aug 2017 22:22:24 -0700 Subject: [PATCH 1/2] Committing small changes and new file --- info.txt | 1 + quotesbot/spiders/toscrape-css-0.py | 22 ++++++++++++++++++++++ quotesbot/spiders/toscrape-css.py | 2 +- 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 info.txt create mode 100644 quotesbot/spiders/toscrape-css-0.py diff --git a/info.txt b/info.txt new file mode 100644 index 0000000..eade7ed --- /dev/null +++ b/info.txt @@ -0,0 +1 @@ +This will be my spider army to find all the lyrics to songs. diff --git a/quotesbot/spiders/toscrape-css-0.py b/quotesbot/spiders/toscrape-css-0.py new file mode 100644 index 0000000..900923d --- /dev/null +++ b/quotesbot/spiders/toscrape-css-0.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +import scrapy + + +class ToScrapeCSSSpider(scrapy.Spider): + name = "toscrape-css" + start_urls = [ + 'http://quotes.toscrape.com/', 'http://soundcloud.com/' + ] + + def parse(self, response): + for quote in response.css("div.quote"): + yield { + 'text': quote.css("span.text::text").extract_first(), + 'author': quote.css("small.author::text").extract_first(), + 'tags': quote.css("div.tags > a.tag::text").extract() + } + + next_page_url = response.css("li.next > a::attr(href)").extract_first() + if next_page_url is not None: + yield scrapy.Request(response.urljoin(next_page_url)) + diff --git a/quotesbot/spiders/toscrape-css.py b/quotesbot/spiders/toscrape-css.py index 555e204..900923d 100644 --- a/quotesbot/spiders/toscrape-css.py +++ b/quotesbot/spiders/toscrape-css.py @@ -5,7 +5,7 @@ class ToScrapeCSSSpider(scrapy.Spider): name = "toscrape-css" start_urls = [ - 'http://quotes.toscrape.com/', + 'http://quotes.toscrape.com/', 'http://soundcloud.com/' ] def parse(self, response): From 9327fd1a65a5edc93a6bfcf6353448f25f60858b Mon Sep 17 00:00:00 2001 From: Doomfires Date: Thu, 10 Aug 2017 22:41:02 -0700 Subject: [PATCH 2/2] Committing 
updated info and a spider in the works --- .info.txt.swp | Bin 0 -> 12288 bytes info.txt | 2 +- quotesbot/spiders/toscrape-test.py | 11 +++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 .info.txt.swp create mode 100644 quotesbot/spiders/toscrape-test.py diff --git a/.info.txt.swp b/.info.txt.swp new file mode 100644 index 0000000000000000000000000000000000000000..91b70126983b3ec1541203d878f45cb0b26d8158 GIT binary patch literal 12288 zcmeI&y-ve05C?DrGarJ51?hmbEuRw1Fja_+A~8CN(-=};aU9xo05S6zyc5rW#6!R( zlm!W~bOHS>MUF4$^Xa!mNwj;}J*T72p^~wp)cf1h)yu}ZdR$V<_#5l$_%EZ$YhPL) zN}ZqZ_^0&yCUNJsFrR|6ZQBiO7hO}Jr1O7#33o3>-k=;Yj*Cr-y w2a?1Tdp*iOk1