-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawlingtools.py
69 lines (51 loc) · 1.56 KB
/
crawlingtools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from selenium import webdriver
from bs4 import BeautifulSoup
def get_beautifulsoup(url):
    """
    Load a page in headless Chrome and parse the rendered HTML.

    A Chrome driver is started with the ``--headless`` and ``--no-sandbox``
    flags, navigated to ``url``, and its page source is handed to
    BeautifulSoup using the ``html.parser`` backend. The driver is always
    shut down before returning, whether or not loading succeeded.

    :param url: url to get beautifulsoup
    :return: beautifulsoup for url. [beautifulsoup]
    :raises Exception: any error raised while starting the driver or loading
        the page propagates to the caller unchanged.
    """
    driver = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument("--no-sandbox")
        # NOTE(review): the positional driver path and the `chrome_options=`
        # keyword look like the older Selenium calling convention -- confirm
        # against the pinned Selenium version before upgrading.
        driver = webdriver.Chrome('chromedriver', chrome_options=options)
        driver.get(url)
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # The driver may never have been created (e.g. chromedriver missing).
        # Quitting unconditionally would raise AttributeError on None and
        # hide the original start-up error.
        if driver is not None:
            driver.quit()
def extract_wod_contents(soup):
    """
    Build the workout-of-the-day HTML fragment from a parsed page.

    Only the first ``article`` element is considered; every ``div > p``
    match inside it is stringified and appended after a fixed heading.

    :param soup: The BeautifulSoup for crossfit .com wod page.
    :return: Heading followed by the paragraph tags as one string, or an
        empty string when no article or no paragraph is found. [string]
    """
    article_tags = soup.select('article')
    if not article_tags:
        return ""

    paragraph_tags = article_tags[0].select('div > p')
    if not paragraph_tags:
        return ""

    heading = '<p><h2>Today\'s workout</h2></p><br>'
    return heading + "".join(map(str, paragraph_tags))
def extract_date_title(soup):
    """
    Compose the title line for a wod page from its day and date elements.

    The day text and the date text are each looked up by CSS class, and the
    text of the first match of each is used. If either lookup finds nothing
    the placeholder "Not Loaded" is returned instead.

    :param soup: The BeautifulSoup for crossfit .com wod page.
    :return: "<day text> <date text>", or "Not Loaded". [string]
    """
    # Day selector first, then date selector -- same order as the output.
    selectors = ('._day-text_4rxx9_118', '._wrapper_kbm27_89')

    pieces = []
    for selector in selectors:
        matches = soup.select(selector)
        if not matches:
            return "Not Loaded"
        pieces.append(matches[0].text)

    day_label, date_label = pieces
    return f"{day_label} {date_label}"