-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrap_g1.py
128 lines (107 loc) · 4.84 KB
/
scrap_g1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
'''
Created on 22, Oct, 2018
@authors: Camila Leite, Vinicius Freitas, Lucas May
'''
import scrapy
import psycopg2
import db_settings
from scrapy.http import Request
from datetime import datetime
import re
class GloboSpider(scrapy.Spider):
    """Crawl the G1 Santa Catarina news feed and store each article in PostgreSQL.

    ``parse`` walks the paginated feed and schedules one request per article
    link; ``parse_news`` extracts the article fields and inserts them into the
    ``news`` table unless a row with the same title, subtitle and portal is
    already present.
    """

    # Connection settings, read from the project-local ``db_settings`` module.
    dbname = db_settings.DBNAME
    dbhost = db_settings.DBHOST
    dbuser = db_settings.DBUSER
    dbpass = db_settings.DBPASS

    debug = False  # when True, extracted values are printed to stdout
    name = 'globo'
    start_urls = ['https://g1.globo.com/sc/santa-catarina']
    stop = False  # set to True after a database error; callbacks then return early
    max_pages = 200  # highest feed page counter that is still processed

    # Matches any HTML tag; compiled once instead of on every article.
    _TAG_RE = re.compile(r"<[^>]+>")

    def __init__(self, *args, **kwargs):
        """Initialise the spider and open the PostgreSQL connection.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        the spider can still be created with spider arguments.
        """
        super(GloboSpider, self).__init__(*args, **kwargs)
        # Keyword arguments avoid hand-building a DSN string, which breaks when
        # a credential contains a quote or a backslash.
        self.conn = psycopg2.connect(dbname=self.dbname,
                                     user=self.dbuser,
                                     host=self.dbhost,
                                     password=self.dbpass)

    def parse(self, response):
        """Yield a request for every article link on a feed page, then the next page.

        The page counter travels in ``response.meta['page']`` and defaults to 1
        for the start URL. Nothing is yielded once ``self.stop`` is set or the
        counter exceeds ``max_pages``.
        """
        if self.stop:
            return
        url_base = "https://g1.globo.com/sc/santa-catarina"
        page = response.meta.get('page', 1)
        if page > self.max_pages:
            return

        for anchor in response.css('.feed-post-body a'):
            href = anchor.xpath('@href').extract_first()
            if not href:
                continue
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones, which Request would otherwise reject.
            yield Request(response.urljoin(href), callback=self.parse_news)

        next_link = url_base + "/index/feed/pagina-" + str(page) + ".ghtml"
        next_request = Request(next_link, callback=self.parse)
        next_request.meta['page'] = page + 1
        yield next_request

    def parse_news(self, response):
        """Extract one article from ``response`` and store it in the database.

        Pages without a parseable date or without a title are skipped with a
        warning instead of raising inside the callback.
        """
        if self.stop:
            return
        if self.debug:
            print("----- ENTERING NEWS PAGE -----")

        date = self._extract_date(response)
        title, subtitle = self._extract_sub_and_title(response)
        if date is None or title is None:
            self.logger.warning("Skipping %s: missing title or date", response.url)
            return
        if subtitle is None:
            # An empty string keeps the duplicate check working; comparing
            # against NULL in SQL would never match an existing row.
            subtitle = ""

        text = self._extract_text(response)
        tag = self._extract_tags(response)
        subject = self._extract_subject()
        author = self._extract_author()
        link = response.url
        portal = "Globo G1"
        self._commit_to_db(date, title, subtitle, text, tag, subject, author, link, portal)

    def _extract_date(self, response):
        """Return the publication ``datetime``, or ``None`` when it cannot be read."""
        raw = response.xpath('//time/text()').extract_first()
        if raw is None:
            return None
        # The hour separator "h" becomes ":" and the first and last characters
        # are dropped (presumably surrounding whitespace -- TODO confirm).
        raw = raw.replace("h", ':')[1:-1]
        if self.debug:
            print(raw)
        try:
            return datetime.strptime(raw, '%d/%m/%Y %H:%M')
        except ValueError:
            return None

    def _extract_sub_and_title(self, response):
        """Return ``(title, subtitle)``; either element is ``None`` when absent."""
        title = response.xpath(
            "//*[contains(@class, 'content-head__title')]/text()").extract_first()
        subtitle = response.xpath(
            "//*[contains(@class, 'content-head__subtitle')]/text()").extract_first()
        if self.debug:
            print(title)
            print(subtitle)
        # A tuple keeps the order stable; a set would unpack in arbitrary order
        # and collapse to a single element when both values are equal.
        return title, subtitle

    def _extract_text(self, response):
        """Return the article body with HTML tags removed.

        Every element whose class contains ``content-text__container`` is
        serialised, stripped of tags and followed by a single space.
        """
        fragments = response.xpath(
            "//*[contains(@class, 'content-text__container')]").extract()
        text = "".join(self._TAG_RE.sub("", fragment) + " " for fragment in fragments)
        if self.debug:
            print("--- FINAL NEWS TEXT EXTRACTED ---")
            print(text)
        return text

    def _extract_tags(self, response):
        """Return the tag names as the ``str()`` of a list with single quotes removed."""
        tag_names = response.xpath("//*[@class='entities__list']").css('a ::text').extract()
        tags = str(tag_names).replace("'", "")
        if self.debug:
            print(tags)
        return tags

    def _extract_subject(self):
        """Return the fixed subject stored for every article from this feed."""
        return "Santa Catarina"

    def _extract_author(self):
        """Return an empty author string; G1 articles carry no authorship."""
        return ""

    def _commit_to_db(self, date, title, subtitle, text, tag, subject, author, link, portal):
        """Insert the article into ``news`` unless an equal row already exists.

        A row counts as a duplicate when title, subtitle and portal all match.
        On a database error the transaction is rolled back and ``self.stop`` is
        set so that later callbacks return without doing any work.
        """
        try:
            with self.conn.cursor() as cur:
                # Placeholders let the driver quote the scraped text, so it
                # can no longer break out of the statement.
                cur.execute(
                    "select count(*) from news "
                    "where title = %s AND subtitle = %s AND portal = %s",
                    (title, subtitle, portal))
                already_stored = cur.fetchone()[0] > 0
                if not already_stored:
                    cur.execute(
                        "insert into news (title, subtitle, date_time, text, authors, "
                        "portal, tags, subject, link) "
                        "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                        (title, subtitle, str(date), text, author, portal, tag, subject, link))
            # Committing on both paths also ends the transaction opened by the
            # duplicate check.
            self.conn.commit()
        except psycopg2.Error as e:
            print("\n\n\n\n\n\n\n\n\nQuery Error: " + str(e) + "\n\n\n\n\n\n\n\n\n\n")
            self.conn.rollback()
            self.stop = True