scrap_dc.py

'''
Created on Oct 22, 2018

@authors: Lucas May Petry

Scrapy spider that crawls news articles from Diario Catarinense
(dc.clicrbs.com.br) and stores them in a PostgreSQL database.
'''
import re
from datetime import datetime

import psycopg2
import scrapy
from scrapy.http import Request

import db_settings
class DC(scrapy.Spider):
    dbname = db_settings.DBNAME
    dbhost = db_settings.DBHOST
    dbuser = db_settings.DBUSER
    dbpass = db_settings.DBPASS
    name = 'DIARIO CATARINENSE'
    start_urls = ['http://dc.clicrbs.com.br/sc/']
    url_base = 'http://dc.clicrbs.com.br/sc/'
    debug = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Keyword arguments avoid the quoting pitfalls of building a
        # libpq connection string by hand.
        self.conn = psycopg2.connect(dbname=self.dbname,
                                     user=self.dbuser,
                                     host=self.dbhost,
                                     password=self.dbpass)
    def parse(self, response):
        # Topic sections to ignore: 'Eleições 2018' (2018 elections)
        # and 'Últimas' (latest news).
        skip = ['Eleições 2018', 'Últimas']
        for item in response.css('.nav-item-noticias .subnav .subnav-left ul li'):
            next_link = item.xpath('a/@href').extract_first()
            subject = item.css('a::text').extract_first()
            if subject in skip:
                continue
            # Request up to 149 listing pages per topic.
            for page in range(1, 150):
                req = Request(next_link + '?pagina=' + str(page),
                              callback=self.parse_topics)
                req.meta['subject'] = subject
                yield req
    def parse_topics(self, response):
        for news in response.css('.box-articles article h2'):
            news_link = news.xpath('a/@href').extract_first()
            req = Request(news_link, callback=self.parse_news)
            req.meta['subject'] = response.meta['subject']
            yield req
    def parse_news(self, response):
        tag_re = re.compile(r"<[^>]+>")  # Regex to strip HTML tags

        def extract_date():
            date = response.css('.article-header .line-published-date-hour .published-date::text').extract_first()
            time = response.css('.article-header .line-published-date-hour .published-hour::text').extract_first()
            time = time.replace('h', ':').replace('min', '')  # e.g. '10h30min' -> '10:30'
            return datetime.strptime(date + ' ' + time, '%d/%m/%Y - %H:%M')

        subject = response.meta['subject']
        title = response.css('.article-header .article-title::text').extract_first()
        subtitle = ''
        date_time = extract_date()
        author = response.css('.article-body .col-left .contributor a::text').extract_first()
        text = response.css('.article-body .col-right .entry-content').extract_first()
        text = tag_re.sub('', text)
        tags = str(response.css('.article-footer .list-tags li a::text').extract())
        link = response.url
        portal = 'DIARIO CATARINENSE'

        cur = self.conn.cursor()
        # Skip articles that are already stored.
        cur.execute("select count(*) from news where title = %s and subtitle = %s and portal = %s",
                    (title, subtitle, portal))
        if cur.fetchone()[0] > 0:
            return
        # Parameterized query: psycopg2 handles quoting, so single quotes no
        # longer need to be stripped from the scraped fields, and a missing
        # author is stored as a real NULL instead of the string 'NULL'.
        query = ("insert into news (title, subtitle, date_time, text, authors, "
                 "portal, tags, subject, link) "
                 "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        try:
            cur.execute(query, (title, subtitle, date_time, text, author,
                                portal, tags, subject, link))
            self.conn.commit()
        except Exception as e:
            self.logger.error("Query error: %s", e)
            self.conn.rollback()

    def closed(self, reason):
        # Called by Scrapy when the spider finishes; release the DB connection.
        self.conn.close()
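
The spider depends on two things that are not part of this file: a db_settings module providing the connection constants, and a news table in PostgreSQL. Below is a minimal sketch of both, assuming a local PostgreSQL instance; only the four constant names and the nine column names are taken from the code above, while the values and column types are illustrative guesses.

# db_settings.py -- hypothetical values; scrap_dc.py only requires the
# four names DBNAME, DBHOST, DBUSER and DBPASS to exist.
DBNAME = 'news_db'
DBHOST = 'localhost'
DBUSER = 'scraper'
DBPASS = 'secret'

# create_table.py -- one-off helper to create the table the spider writes to.
# The column list mirrors the insert in parse_news; the types are assumptions.
import psycopg2
import db_settings

conn = psycopg2.connect(dbname=db_settings.DBNAME, user=db_settings.DBUSER,
                        host=db_settings.DBHOST, password=db_settings.DBPASS)
cur = conn.cursor()
cur.execute("""
    create table if not exists news (
        title     text,
        subtitle  text,
        date_time timestamp,
        text      text,
        authors   text,
        portal    text,
        tags      text,
        subject   text,
        link      text
    )
""")
conn.commit()
conn.close()

With both in place, the spider can be run directly with: scrapy runspider scrap_dc.py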