forked from DataGlacier/VC
-
Notifications
You must be signed in to change notification settings - Fork 0
/
moneycontrol_scrapper.py
52 lines (42 loc) · 1.64 KB
/
moneycontrol_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
import json
import requests
import datetime
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
# Listing page the scrape starts from.
src_url = 'https://www.moneycontrol.com/news/technical-call-221.html'
# Scraped results: page index (as a string) -> list of dicts appended by scrap().
submission = defaultdict(list)
#get next page links and call scrap() on each link
def setup(url, timeout=30):
    """Collect the pagination links found on ``url`` and scrape each one.

    Every link is passed to ``scrap()`` together with its position in the
    pagination list, which ``scrap()`` uses as the page index.

    Args:
        url: Address of the listing page whose pagination block is read.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.RequestException: If the page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    src = BeautifulSoup(response.text, 'lxml')

    pagination = src.find("div", attrs={"class": "pagenation"})
    if pagination is None:
        # find() returns None when the block is absent; nothing to follow.
        return

    #ignore <a> with void js as href
    anchors = pagination.find_all(
        'a', {'href': re.compile('^((?!void).)*$')})
    nextlinks = [anchor['href'] for anchor in anchors]
    for idx, link in enumerate(tqdm(nextlinks)):
        # urljoin keeps site-relative hrefs working and leaves absolute
        # hrefs intact instead of gluing the host in front of them.
        scrap(urljoin('https://www.moneycontrol.com', link), idx)
#scraps passed page url
def scrap(url, idx, timeout=30):
    """Scrape one listing page and store its headlines in ``submission``.

    Three dicts are appended to ``submission[str(idx)]``, in this order:
    ``{'title': [...]}``, ``{'date': [...]}`` and ``{'img_src': [...]}``.

    Args:
        url: Address of the listing page to scrape.
        idx: Page index; its string form is the key used in ``submission``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.RequestException: If the page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    src = BeautifulSoup(response.text, 'lxml')

    # Look the list up once; find() returns None when it is missing, in
    # which case the page is recorded with empty lists instead of crashing.
    listing = src.find("ul", {"id": "cagetory"})
    if listing is None:
        span, img = [], []
    else:
        span = listing.find_all('span')
        img = listing.find_all('img')

    #<img> has alt text attr set as heading of news, therefore get img link and heading from same tag
    # .get() yields None for an absent attribute, so one incomplete <img>
    # cannot abort the run and the two lists stay aligned.
    imgs = [tag.get('src') for tag in img]
    titles = [tag.get('alt') for tag in img]
    date = [tag.get_text() for tag in span]

    #list of dicts as values and indexed by page number
    page = submission[str(idx)]
    page.append({'title': titles})
    page.append({'date': date})
    page.append({'img_src': imgs})
#save data as json named by current date
def json_dump(data):
    """Write ``data`` as JSON to a file named after today's date.

    The file is created in the current working directory as
    ``moneycontrol_<Month DD, YYYY>.json`` and overwritten if it exists.

    Args:
        data: JSON-serialisable object to write.
    """
    date = datetime.date.today().strftime("%B %d, %Y")
    with open('moneycontrol_' + date + '.json', 'w', encoding='utf-8') as outfile:
        # Serialise the argument that was passed in, not a module global,
        # so the function writes whatever the caller hands it.
        json.dump(data, outfile)
# Script entry: these run whenever the module is executed or imported.
# Walk the pagination of the start page; setup() calls scrap() for each
# link, which fills the module-level `submission` mapping.
setup(src_url)
# Must come after setup() so the collected results exist before they are
# written to the dated JSON file.
json_dump(submission)