scrape.py
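"""Scrape German news texts into the `news` table: satire from the
Titanic newsticker and Der Postillon (labelled fake) and articles from
the BR24 API (labelled real)."""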
from lxml import html
import requests
from text import clean_text
from db import get_db
import json
import re
import requests_cache
requests_cache.install_cache()
db = get_db()
cursor = db.cursor()
cursor.execute ("SELECT VERSION()")
row = cursor.fetchone ()
print "server version:", row[0]
cursor.execute('SET NAMES utf8;')
cursor.execute('SET CHARACTER SET utf8;')
cursor.execute('SET character_set_connection=utf8;')
def insert(text, source, fake):
    """Store one article; texts of 1000 characters or less are skipped."""
    text = clean_text(text.strip())
    if len(text) > 1000:
        print(">>> ", text[0:200])
        # The connection charset is set to utf8 above, so plain strings are fine.
        cursor.execute("INSERT INTO news (text, source, fake) VALUES (%s, %s, %s)", (
            text,
            source,
            1 if fake else 0
        ))
    else:
        print("--- (skipped)")
def scrape_titanic():
    # Titanic newsticker pages; everything there is satire, so label it fake.
    fake_sources = []
    for x in range(1, 1000):
        fake_sources.append({
            'url': "http://www.titanic-magazin.de/newsticker/seite/{}/".format(x),
            'text_selector': ".tt_news-bodytext"
        })
    for s in fake_sources:
        page = requests.get(s['url'])
        tree = html.fromstring(page.content)
        texts = tree.cssselect(s['text_selector'])
        for t in texts:
            txt = t.text_content()
            insert(txt, "titanic", True)
def scrape_br24():
    # BR24 articles from the hackathon API are labelled as real news.
    # TODO: remove "Quelle"
    for x in range(1, 12):
        doc = requests.get("https://br24-backend-hackathon.br.de/api/v4/news?limit=1000&page={}".format(x))
        j = json.loads(doc.content)['data']
        for article in j:
            if article['articleType'] == "news_text":
                insert(article['text'], "br24", False)
def scrape_postillion():
    # Walk the Postillon archive month by month, collect article URLs first,
    # then fetch each article body. All of it is satire, hence fake.
    urls = set()
    for y in range(2015, 2017):
        for m in range(1, 13):
            # Zero-pad the month so updated-max is a valid ISO date.
            page = requests.get("http://www.der-postillon.com/search?updated-max={}-{:02d}-01T00:00:00%2B00:00&max-results=50".format(y, m))
            tree = html.fromstring(page.content)
            links = tree.cssselect(".post-title.entry-title a")
            for l in links:
                url = l.attrib['href']
                if len(url) > 0:
                    urls.add(url)
    for url in urls:
        page = requests.get(url)
        tree = html.fromstring(page.content)
        text = tree.cssselect('.post-body')[0]
        # Drop everything up to the "(dpo) - " dateline marker.
        text = re.sub(r'^.*\(dpo\) - ', '', text.text_content().strip())
        insert(text, "postillion", True)
scrape_postillion()
scrape_br24()
scrape_titanic()
# commit your changes
db.commit()
#joblib.dump(fake_news, 'fake_news.pkl')