diff --git a/.info.txt.swp b/.info.txt.swp new file mode 100644 index 0000000..91b7012 Binary files /dev/null and b/.info.txt.swp differ diff --git a/info.txt b/info.txt new file mode 100644 index 0000000..77462d5 --- /dev/null +++ b/info.txt @@ -0,0 +1 @@ +This will be my spider army to find all the lyrics to songs. Bad and good words alike! diff --git a/quotesbot/spiders/toscrape-css-0.py b/quotesbot/spiders/toscrape-css-0.py new file mode 100644 index 0000000..900923d --- /dev/null +++ b/quotesbot/spiders/toscrape-css-0.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +import scrapy + + +class ToScrapeCSSSpider(scrapy.Spider): + name = "toscrape-css" + start_urls = [ + 'http://quotes.toscrape.com/', 'http://soundcloud.com/' + ] + + def parse(self, response): + for quote in response.css("div.quote"): + yield { + 'text': quote.css("span.text::text").extract_first(), + 'author': quote.css("small.author::text").extract_first(), + 'tags': quote.css("div.tags > a.tag::text").extract() + } + + next_page_url = response.css("li.next > a::attr(href)").extract_first() + if next_page_url is not None: + yield scrapy.Request(response.urljoin(next_page_url)) + diff --git a/quotesbot/spiders/toscrape-css.py b/quotesbot/spiders/toscrape-css.py index 555e204..900923d 100644 --- a/quotesbot/spiders/toscrape-css.py +++ b/quotesbot/spiders/toscrape-css.py @@ -5,7 +5,7 @@ class ToScrapeCSSSpider(scrapy.Spider): name = "toscrape-css" start_urls = [ - 'http://quotes.toscrape.com/', + 'http://quotes.toscrape.com/', 'http://soundcloud.com/' ] def parse(self, response): diff --git a/quotesbot/spiders/toscrape-test.py b/quotesbot/spiders/toscrape-test.py new file mode 100644 index 0000000..22a90f6 --- /dev/null +++ b/quotesbot/spiders/toscrape-test.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +import scrapy + +class ToScrapeHTMLSpider(scrapy.Spider): + name = "toscrape-html" + start_urls = [ + 'http://quotes.toscrape.com/' +] + + def parse(self, response): + for quote in response.html("