statista.py
import json
import os
import time

from article import Article
from helper import Scraper

STATISTA_HOME = 'https://www.statista.com'


class Statista(Scraper):
    """Scrape chart-of-the-day infographics from Statista."""

    def __init__(self, dataPath='data'):
        super().__init__()
        self.articles = None
        self.url = self.getPageUrl()
        # Keep all artefact paths anchored to the same absolute data directory.
        self.dataPath = f'{os.getcwd()}/{dataPath}'
        self.mediaPath = f'{self.dataPath}/media'
        self.dataFilePath = f'{self.dataPath}/info.json'
        self.infoFileDesc = None

    @staticmethod
    def scrapeArticle(articleUrl):
        # Fetch one article page and return its (articleId, serialized data) pair.
        page = Article(articleUrl)
        articleId, article = page.serialize()
        return articleId, article

    @staticmethod
    def getPageUrl(path='chartoftheday', page=1):
        return f'{STATISTA_HOME}/{path}/ALL/p/{page}/'

    def getGraphicPath(self, articleId):
        return f'{self.mediaPath}/{articleId}.jpeg'

    def isValidPage(self, soup):
        return self.getElement(
            soup, 'div', {'class': 'note'}) is not None

    def getInfoFileDesc(self):
        # Lazily (re)open the metadata file; note that 'w+' truncates existing content.
        if not self.infoFileDesc or self.infoFileDesc.closed:
            self.infoFileDesc = open(self.dataFilePath, 'w+')
        return self.infoFileDesc

    def getArticleUrls(self, pageNo):
        """Return article URLs on the given listing page that still need scraping."""
        lis = []
        alreadyPresent = 0
        pageUrl = self.getPageUrl(page=pageNo)
        source, soup = self.getSourceSoup(pageUrl)
        articles = self.getAllElements(
            soup, 'a', {'class': 'infographicsPanelCard'})
        for article in articles:
            articleUrl = article.attrs.get('href', None)
            if not articleUrl:
                continue
            articleId = articleUrl.split('/')[2]
            # Skip articles whose graphic and metadata already exist on disk.
            graphicFile = self.checkFile(
                self.getGraphicPath(articleId))
            textFile = self.checkFile(self.dataFilePath)
            if not graphicFile or not textFile:
                lis.append(f'{STATISTA_HOME}{articleUrl}')
            else:
                alreadyPresent += 1
        return lis, alreadyPresent

    def scrapeArticles(self, pageNo):
        articles = []
        articleUrls, alreadyPresent = self.getArticleUrls(pageNo)
        for articleUrl in articleUrls:
            # Reset per iteration so a failed scrape cannot re-append the previous article.
            article = None
            try:
                article = self.scrapeArticle(articleUrl)
            except Exception as e:
                print(f'Exception for {articleUrl}; e = {e}')
            if article:
                articles.append(article)
        return articles, alreadyPresent

    def scrape(self, start=1, end=124):
        for pageNo in range(start, end + 1):
            articles, alreadyPresent = self.scrapeArticles(pageNo)
            print(f'Page: {pageNo}, New: {len(articles)}, '
                  f'Already Present: {alreadyPresent}')
            # Merge newly scraped articles into the existing metadata file.
            obj = {}
            try:
                with open(self.dataFilePath, 'r') as fileDesc:
                    data = fileDesc.read()
            except OSError:
                data = ''
            if data:
                obj = json.loads(data)
            for articleId, article in articles:
                obj[articleId] = article
            with open(self.dataFilePath, 'w') as fileDesc:
                json.dump(obj, fileDesc)
            self.articles = obj
            # Throttle requests between listing pages.
            time.sleep(10)
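

# A minimal usage sketch, assuming the local helper.Scraper and article.Article
# modules are importable and that the data/ and data/media/ directories exist.
# The page range here is only an example; adjust start/end as needed.
if __name__ == '__main__':
    scraper = Statista(dataPath='data')
    scraper.scrape(start=1, end=3)
    print(f'Total articles stored: {len(scraper.articles or {})}')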