From 22445b03af4313b0f64028e076090639b1ce165d Mon Sep 17 00:00:00 2001
From: habenkiros
Date: Fri, 14 Jul 2023 21:34:59 +0300
Subject: [PATCH 1/3] generic parsing

---
 web_scraper/output.jsonl                   |  0
 web_scraper/web_scraper/spiders/generic.py | 27 +++++++++++++------
 .../web_scraper/spiders/savantlynet.py     |  2 +-
 3 files changed, 20 insertions(+), 9 deletions(-)
 create mode 100644 web_scraper/output.jsonl

diff --git a/web_scraper/output.jsonl b/web_scraper/output.jsonl
new file mode 100644
index 0000000..e69de29
diff --git a/web_scraper/web_scraper/spiders/generic.py b/web_scraper/web_scraper/spiders/generic.py
index 6cfc93b..a07db46 100644
--- a/web_scraper/web_scraper/spiders/generic.py
+++ b/web_scraper/web_scraper/spiders/generic.py
@@ -3,6 +3,7 @@
 from scrapy.linkextractors import LinkExtractor
 from scrapy.settings import Settings
 from scrapy.spiders import CrawlSpider, Spider, Rule
+from scrapy.exporters import JsonLinesItemExporter
 from scrapy.http import Response
 import os
 from twisted.internet.asyncioreactor import install
@@ -22,16 +23,26 @@ class ScrapedPage(scrapy.Item):
     text = scrapy.Field()
     links = scrapy.Field()
 
+
 class GenericSpider(scrapy.Spider):
     name = "generic"
     allowed_domains = ["savantly.net"]
-    start_urls = ["https://savantly.net"]
+
+    custom_settings = {
+        'FEED_FORMAT': 'jsonlines',
+        'FEED_URI': 'output.jsonl'
+    }
+
+    def __init__(self, start_url=None, *args, **kwargs):
+        super(GenericSpider, self).__init__(*args, **kwargs)
+        if start_url:
+            self.start_urls = [start_url]
 
     def parse(self, response):
-        item = {
-            'url': response.url,
-            'status': response.status,
-            'text': '\n'.join(response.xpath('//body//text()').getall()),
-        }
-        yield item
-        
\ No newline at end of file
+        scraped_page = ScrapedPage()
+        scraped_page['url'] = response.url
+        scraped_page['status'] = response.status
+        scraped_page['headers'] = response.headers
+        scraped_page['text'] = '\n'.join(response.xpath('//body//text()').getall())
+        scraped_page['links'] = [link.url for link in LinkExtractor(allow_domains=self.allowed_domains).extract_links(response)]
+        yield scraped_page
diff --git a/web_scraper/web_scraper/spiders/savantlynet.py b/web_scraper/web_scraper/spiders/savantlynet.py
index ee20d46..9bedcd0 100644
--- a/web_scraper/web_scraper/spiders/savantlynet.py
+++ b/web_scraper/web_scraper/spiders/savantlynet.py
@@ -118,4 +118,4 @@ def extract_contact_section_data(self, section):
 
             data['contact_info'] = extracted_data
 
-    return data
+    return data
\ No newline at end of file

From 463f93403a6b754f0274760fe71638d9c90488a9 Mon Sep 17 00:00:00 2001
From: habenkiros
Date: Fri, 14 Jul 2023 22:21:57 +0300
Subject: [PATCH 2/3] SaveHtmlPipeline added

---
 requirements.txt                           |  3 ++-
 web_scraper/web_scraper/spiders/generic.py | 17 ++++++++++++-----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index bcc49e2..30f5d09 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,5 @@ dynaconf
 pypdf
 numpy==1.24
 wrapt>=1.14,<1.15
-scrapy
\ No newline at end of file
+scrapy
+twisted
\ No newline at end of file
diff --git a/web_scraper/web_scraper/spiders/generic.py b/web_scraper/web_scraper/spiders/generic.py
index a07db46..9378ef2 100644
--- a/web_scraper/web_scraper/spiders/generic.py
+++ b/web_scraper/web_scraper/spiders/generic.py
@@ -28,16 +28,15 @@ class GenericSpider(scrapy.Spider):
     name = "generic"
     allowed_domains = ["savantly.net"]
 
-    custom_settings = {
-        'FEED_FORMAT': 'jsonlines',
-        'FEED_URI': 'output.jsonl'
-    }
-
     def __init__(self, start_url=None, *args, **kwargs):
         super(GenericSpider, self).__init__(*args, **kwargs)
         if start_url:
             self.start_urls = [start_url]
 
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse)
+
     def parse(self, response):
         scraped_page = ScrapedPage()
         scraped_page['url'] = response.url
@@ -46,3 +45,11 @@ def parse(self, response):
         scraped_page['text'] = '\n'.join(response.xpath('//body//text()').getall())
         scraped_page['links'] = [link.url for link in LinkExtractor(allow_domains=self.allowed_domains).extract_links(response)]
         yield scraped_page
+
+
+class SaveHtmlPipeline(object):
+    def process_item(self, item, spider):
+        filename = f"{DOCS_PATH}/{item['url'].replace('/', '_')}.html"
+        with open(filename, 'wb') as f:
+            f.write(item['text'].encode('utf-8'))
+        return item

From 7815601304fa091596af5d40258fa095f8197028 Mon Sep 17 00:00:00 2001
From: habenkiros
Date: Sat, 15 Jul 2023 13:02:11 +0300
Subject: [PATCH 3/3] saving html output

---
 web_scraper/output.jsonl                   |  0
 web_scraper/web_scraper/spiders/generic.py | 22 +++++++++++-----------
 2 files changed, 11 insertions(+), 11 deletions(-)
 delete mode 100644 web_scraper/output.jsonl

diff --git a/web_scraper/output.jsonl b/web_scraper/output.jsonl
deleted file mode 100644
index e69de29..0000000
diff --git a/web_scraper/web_scraper/spiders/generic.py b/web_scraper/web_scraper/spiders/generic.py
index 9378ef2..013015d 100644
--- a/web_scraper/web_scraper/spiders/generic.py
+++ b/web_scraper/web_scraper/spiders/generic.py
@@ -28,15 +28,17 @@ class GenericSpider(scrapy.Spider):
     name = "generic"
     allowed_domains = ["savantly.net"]
 
+    custom_settings = {
+        'FEED_FORMAT': 'jsonlines',
+        'FEED_URI': f"{DOCS_PATH}/output.jsonl",
+        'MEDIA_ALLOW_REDIRECTS': True
+    }
+
     def __init__(self, start_url=None, *args, **kwargs):
         super(GenericSpider, self).__init__(*args, **kwargs)
         if start_url:
             self.start_urls = [start_url]
 
-    def start_requests(self):
-        for url in self.start_urls:
-            yield scrapy.Request(url, callback=self.parse)
-
     def parse(self, response):
         scraped_page = ScrapedPage()
         scraped_page['url'] = response.url
@@ -46,10 +48,8 @@ def parse(self, response):
         scraped_page['links'] = [link.url for link in LinkExtractor(allow_domains=self.allowed_domains).extract_links(response)]
         yield scraped_page
 
-
-class SaveHtmlPipeline(object):
-    def process_item(self, item, spider):
-        filename = f"{DOCS_PATH}/{item['url'].replace('/', '_')}.html"
-        with open(filename, 'wb') as f:
-            f.write(item['text'].encode('utf-8'))
-        return item
+        # Save HTML response to a file
+        domain_name = response.url.split('//')[-1].split('/')[0]
+        filename = f"{DOCS_PATH}/{domain_name}.html"
+        with open(filename, 'wb') as file:
+            file.write(response.body)
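
With the series applied, GenericSpider can be pointed at an arbitrary site via the
start_url keyword that PATCH 1/3 wires into __init__. Below is a minimal driver
sketch, not part of the patches: it assumes the package import path
web_scraper.web_scraper.spiders.generic, and that DOCS_PATH is defined in the module
and points at a writable directory, since the class-level FEED_URI f-string is
evaluated as soon as the module is imported.

    # Hypothetical driver, equivalent to `scrapy crawl generic -a start_url=...`;
    # CrawlerProcess forwards the start_url keyword to GenericSpider.__init__.
    from scrapy.crawler import CrawlerProcess

    from web_scraper.web_scraper.spiders.generic import GenericSpider  # assumed path

    process = CrawlerProcess()
    process.crawl(GenericSpider, start_url="https://savantly.net")
    process.start()  # blocks until the crawl finishes

The same crawl works from the project directory with the stock CLI:
`scrapy crawl generic -a start_url=https://savantly.net`.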