-
Notifications
You must be signed in to change notification settings - Fork 1
/
ScrapNews.py
56 lines (46 loc) · 1.05 KB
/
ScrapNews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 18 22:21:26 2019
@author: prakhar
"""
'''
This script extracts the headlines of the news articles currently linked from the home page of The Hindu newspaper (https://www.thehindu.com).
'''
from urllib.parse import urlsplit
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.thehindu.com"


def fetch_hrefs(page_url=url):
    """Download *page_url* and return the ``href`` of every ``<a>`` tag.

    Args:
        page_url: Address of the page to scrape; defaults to the module-level
            ``url``.

    Returns:
        A list with one entry per anchor tag, in document order.  An entry is
        ``None`` when the tag carries no ``href`` attribute.
    """
    # The context manager closes the HTTP response once parsing is done
    # instead of leaving the connection open until garbage collection.
    with urlopen(page_url) as response:
        soup = BeautifulSoup(response, "lxml")
    return [link.get("href") for link in soup.find_all("a")]


def headline_slug(href):
    """Return the headline slug of an article link, or ``None``.

    A link counts as an article when its URL path contains ``.ece``.  The
    slug is the path segment immediately before the last one, e.g.
    ``/news/national/some-big-story/article1.ece`` -> ``some-big-story``.

    The path is obtained by parsing the URL rather than by cutting a fixed
    number of leading characters, so absolute links, relative links and links
    with a query string are all handled the same way.

    Args:
        href: The raw ``href`` value; may be ``None`` or empty.

    Returns:
        The slug string, or ``None`` when *href* is missing, malformed, not
        an article link, or has no segment before the final one.
    """
    if not href:
        return None
    try:
        path = urlsplit(href).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. unbalanced brackets).
        return None
    if ".ece" not in path:
        return None
    # Splitting from the right yields [prefix, slug, last segment] only when
    # the path holds at least two slashes; anything shorter has no slug.
    segments = path.rsplit("/", 2)
    if len(segments) < 3 or not segments[1]:
        return None
    return segments[1]


def slug_to_title(slug):
    """Convert a hyphen-separated slug into a space-separated headline."""
    return slug.replace("-", " ")


def extract_headlines(hrefs):
    """Return the unique headlines found in *hrefs*.

    Args:
        hrefs: Iterable of ``href`` values (strings or ``None``).

    Returns:
        A list of headline strings without duplicates, ordered by first
        appearance in *hrefs* so that repeated runs give a stable order.
    """
    slugs = (headline_slug(href) for href in hrefs)
    titles = (slug_to_title(slug) for slug in slugs if slug is not None)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(titles))


def main():
    """Scrape the home page and print the headlines as a pandas Series."""
    headlines = extract_headlines(fetch_hrefs())
    data = pd.Series(data=headlines, index=None, dtype=str)
    print(data)


# Only hit the network when executed as a script, not when imported.
if __name__ == "__main__":
    main()