NewsCrawler.py
import pandas as pd
import os
from newspaper import Article
from tqdm import tqdm
import glob
import Params


def crawl_request(url):
    """Download and parse a single article with newspaper3k."""
    article = Article(url)
    article.download()
    article.parse()
    return article


def news_crawler(tweet_path, tweet_entities_path):
    """Crawl the news articles behind the expanded URLs of tweet entities and save them to CSV."""
    tweets = pd.read_csv(tweet_path)
    tweet_entities = pd.read_csv(tweet_entities_path)
    tweet_entities.dropna(inplace=True, subset=['ExpandedUrl'])

    urls = tweet_entities['ExpandedUrl']
    short_urls = tweet_entities['Url']
    display_urls = tweet_entities['DisplayUrl']
    tweet_ids = tweet_entities['TweetId']
    entity_type_codes = tweet_entities['EntityTypeCode']

    # Accumulators for the crawled articles and their tweet metadata.
    accepted_news_ids = []
    accepted_tweet_ids = []
    accepted_user_ids = []
    news_articles = []
    news_titles = []
    publish_date = []
    accepted_urls = []
    accepted_short_urls = []
    accepted_display_urls = []
    description = []
    source_urls = []

    # With chunking enabled, results are flushed to per-chunk CSV files in the
    # same directory the merge step reads from, then concatenated at the end.
    chunk = True
    chunk_dir = f'{Params.dal["path"]}/News'
    if chunk and not os.path.isdir(chunk_dir):
        os.makedirs(chunk_dir)
    chunk_size = 20000

    indices = urls.index
    url_values = urls.values
    for i in tqdm(range(len(url_values))):
        # Only entity rows with EntityTypeCode == 2 are crawled; positional
        # .iloc keeps the lookup aligned with url_values.
        if entity_type_codes.iloc[i] != 2:
            continue
        url = url_values[i]
        ind = indices[i]

        # Flush a full chunk of crawled rows to disk and reset the accumulators.
        if chunk and i % chunk_size == 0 and i > 0:
            news = {'NewsId': accepted_news_ids, 'UserId': accepted_user_ids, 'TweetId': accepted_tweet_ids,
                    'ExpandedUrl': accepted_urls, 'ShortUrl': accepted_short_urls, 'DisplayUrl': accepted_display_urls,
                    'SourceUrl': source_urls, 'Text': news_articles, 'Title': news_titles, 'Description': description,
                    'PublicationTime': publish_date}
            news = pd.DataFrame.from_dict(news)
            news.to_csv(f'{chunk_dir}/News_Chunk{i // chunk_size}.csv', index=False)
            accepted_news_ids = []
            accepted_tweet_ids = []
            accepted_user_ids = []
            accepted_urls = []
            accepted_short_urls = []
            accepted_display_urls = []
            description = []
            source_urls = []
            news_articles = []
            news_titles = []
            publish_date = []

        try:
            # Skip URLs already crawled within the current chunk.
            if url in accepted_urls:
                continue
            article = crawl_request(url)
            # Resolve the author of the tweet carrying this URL before appending,
            # so a failed lookup cannot leave the accumulator lists misaligned.
            uid = tweets[tweets.Id == tweet_ids[ind]]['UserId'].iloc[0]
            accepted_news_ids.append(ind)
            accepted_tweet_ids.append(tweet_ids[ind])
            accepted_user_ids.append(uid)
            accepted_short_urls.append(short_urls[ind])
            accepted_display_urls.append(display_urls[ind])
            accepted_urls.append(url)
            source_urls.append(article.source_url)
            text = article.text
            title = article.title
            publish_date.append(article.publish_date)
            news_articles.append(text)
            news_titles.append(title)
            description.append(article.meta_description)
        except Exception:
            # Articles that fail to download, parse, or match a tweet are skipped.
            pass
    if not chunk:
        news = {'NewsId': accepted_news_ids, 'UserId': accepted_user_ids, 'TweetId': accepted_tweet_ids,
                'ExpandedUrl': accepted_urls, 'ShortUrl': accepted_short_urls, 'DisplayUrl': accepted_display_urls,
                'SourceUrl': source_urls, 'Text': news_articles, 'Title': news_titles, 'Description': description,
                'PublicationTime': publish_date}
        news = pd.DataFrame.from_dict(news)
    else:
        # Flush any rows collected after the last full chunk, then merge all chunk files.
        if accepted_urls:
            tail = {'NewsId': accepted_news_ids, 'UserId': accepted_user_ids, 'TweetId': accepted_tweet_ids,
                    'ExpandedUrl': accepted_urls, 'ShortUrl': accepted_short_urls, 'DisplayUrl': accepted_display_urls,
                    'SourceUrl': source_urls, 'Text': news_articles, 'Title': news_titles, 'Description': description,
                    'PublicationTime': publish_date}
            pd.DataFrame.from_dict(tail).to_csv(f'{chunk_dir}/News_Chunk_final.csv', index=False)
        frame_paths = sorted(glob.glob(f'{chunk_dir}/*_Chunk*.csv'))
        frames = [pd.read_csv(f) for f in frame_paths]
        news = pd.concat(frames, ignore_index=True)

    # Replace the crawl-time entity indices with sequential NewsIds and write the merged file.
    news['NewsId'] = list(range(len(news)))
    news.to_csv(f'{Params.dal["path"]}/News.csv', index=False)
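

# Example usage: a minimal sketch. The input CSV names below are assumptions,
# not part of the original file; point them at the tweet and tweet-entity
# exports produced by the rest of this pipeline.
if __name__ == '__main__':
    news_crawler(f'{Params.dal["path"]}/Tweets.csv',
                 f'{Params.dal["path"]}/TweetEntities.csv')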