-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharticle.py
105 lines (86 loc) · 2.95 KB
/
article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import requests
from helper import Scraper
class Article(Scraper):
    """Scrape one article page and download its graphic.

    Creating an instance immediately fetches ``url``, pulls the article
    fields out of the parsed page and saves the article image below
    ``mediaPath`` (see :meth:`bootstrap`).

    ``getSourceSoup``, ``getElement`` and ``checkFile`` are not defined in
    this class; they are resolved through ``Scraper``.

    Attributes set on the instance:
        url, dataPath, mediaPath: the constructor arguments, stored as given.
        title, body, tag, publishedDate: scraped strings; each stays ``''``
            when its element is not found on the page.
        articleId: fifth ``'/'``-separated piece of ``url``.
        graphicUrl: value of the graphic's ``data-src`` attribute, or ``None``
            when no graphic element/image was found.
        graphicPath: ``'{mediaPath}/{articleId}.jpeg'``.
    """

    # Seconds to wait on the image server; without a timeout ``requests``
    # can block forever on a stalled connection.
    GRAPHIC_TIMEOUT = 30

    def __init__(self, url, mediaPath='data/media', dataPath='data'):
        """Store the settings and scrape ``url`` right away.

        Args:
            url: address of the article page; must not be the empty string.
            mediaPath: directory the downloaded graphic is written to.
            dataPath: stored on the instance; not used by this class itself.

        Raises:
            AssertionError: if ``url`` is the empty string.
        """
        super().__init__()
        # Raised explicitly (instead of via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type and message are
        # kept so existing callers observe the same failure.
        if url == '':
            raise AssertionError('invalid url')
        self.url = url
        self.dataPath = dataPath
        self.mediaPath = mediaPath
        self.title = ''
        self.body = ''
        self.tag = ''
        self.publishedDate = ''
        self.articleId = None
        self.graphicUrl = None
        self.graphicPath = None
        self.bootstrap()

    def bootstrap(self):
        """Fetch the page and populate every field, then download the graphic."""
        self.source, self.soup = self.getSourceSoup(self.url)
        self.setArticleId()
        self.setTitle()
        self.setBody()
        self.setTag()
        self.setPublishedDate()
        # The path depends on articleId, so it can only be built at this point.
        self.graphicPath = f'{self.mediaPath}/{self.articleId}.jpeg'
        self.setGraphic()

    def _extractText(self, tagName, attrs):
        """Return the stripped text of the element ``getElement`` finds, or None."""
        element = self.getElement(self.soup, tagName, attrs)
        if element:
            return element.getText().strip()
        return None

    def setArticleId(self):
        """Set ``articleId`` to the fifth ``'/'``-separated piece of the URL.

        For ``scheme://host/a/b/...`` that piece is ``b``.

        Raises:
            IndexError: if the URL splits into fewer than five pieces.
        """
        self.articleId = self.url.split('/')[4]

    def setTitle(self):
        """Set ``title`` from the ``h1#infographicArticleTitle`` element, if found."""
        text = self._extractText('h1', {'id': 'infographicArticleTitle'})
        if text is not None:
            self.title = text

    def setBody(self):
        """Set ``body`` from the ``div.article__contentText`` element, if found."""
        text = self._extractText('div', {'class': 'article__contentText'})
        if text is not None:
            self.body = text

    def setTag(self):
        """Set ``tag`` from the ``div.article__topic`` element, if found."""
        text = self._extractText('div', {'class': 'article__topic'})
        if text is not None:
            self.tag = text

    def setPublishedDate(self):
        """Set ``publishedDate`` from the ``datetime`` attribute of the date element.

        The element looked up is ``time.infographic__date--published``; a
        missing ``datetime`` attribute yields ``''``.
        """
        element = self.getElement(
            self.soup,
            'time', {'class': 'infographic__date--published'}
        )
        if element:
            self.publishedDate = element.attrs.get('datetime', '').strip()

    def setGraphicUrl(self):
        """Set ``graphicUrl`` from the ``data-src`` of the graphic's ``<img>``.

        Leaves ``graphicUrl`` untouched when ``div.article__graphic`` or an
        ``<img>`` inside it is not found; a missing ``data-src`` yields ``''``.
        """
        element = self.getElement(
            self.soup,
            'div', {'class': 'article__graphic'}
        )
        if not element:
            return
        image = element.find('img')
        if image:
            self.graphicUrl = image.attrs.get('data-src', '')

    def checkMedia(self):
        """Return the result of ``checkFile`` for ``graphicPath``."""
        return self.checkFile(self.graphicPath)

    def setGraphic(self):
        """Download the graphic to ``graphicPath`` unless it is already there.

        Nothing is written when no graphic URL is known, when ``checkMedia()``
        is truthy, or when the server answers with an error status.
        """
        if not self.graphicUrl:
            self.setGraphicUrl()
        if not self.graphicUrl or self.checkMedia():
            return
        response = requests.get(self.graphicUrl, timeout=self.GRAPHIC_TIMEOUT)
        # Skip error responses: saving a 404/500 body as ``.jpeg`` would make
        # checkMedia() treat the broken file as a valid download from then on.
        if not response.ok:
            return
        os.makedirs(os.path.dirname(self.graphicPath), exist_ok=True)
        with open(self.graphicPath, 'wb') as handler:
            handler.write(response.content)

    def serialize(self):
        """Return ``(articleId, fields)`` where ``fields`` is a plain dict.

        ``fields`` has the keys ``title``, ``body``, ``graphic`` (the local
        ``graphicPath``), ``publishedDate`` and ``tag``.
        """
        mp = {
            'title': self.title,
            'body': self.body,
            'graphic': self.graphicPath,
            'publishedDate': self.publishedDate,
            'tag': self.tag,
        }
        return self.articleId, mp