from selenium import webdriver
import time
import pandas as pd
import unidecode
import sqlite3

# Chrome options: incognito and headless, combined in a single options object
options = webdriver.ChromeOptions()
options.add_argument('--incognito')
options.add_argument('--headless')
# get the web driver up and running
driver = webdriver.Chrome(options=options)
driver.get('https://www.glassdoor.com/index.htm')
# set the window size so JavaScript-rendered parts do not get hidden
driver.set_window_size(1280, 1024)
# Type in the job title and location
driver.find_element_by_css_selector('#KeywordSearch').send_keys('Data Scientist')
driver.find_element_by_css_selector('#LocationSearch').clear()
driver.find_element_by_css_selector('#LocationSearch').send_keys('Hong Kong')
driver.find_element_by_css_selector('#HeroSearchButton').click()
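# Note: these CSS ids (#KeywordSearch, #LocationSearch, #HeroSearchButton) match
# Glassdoor's markup at the time of writing; if the site's layout changes, the
# selectors above and the XPaths below will need to be updated.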
# Loop flag; set to False once the last page of results is reached
end = True
# Initialize columns for the data frame before putting it together
# Long-term plan is to automate this with Morph.io and pull the data through an API
cols = ['Title', 'Company', 'Link', 'Rating', 'Job_Description', 'Size', 'Founded', 'Company_Type', 'Industry',
        'Revenue', 'CEO', 'Recommend', 'Approve']
df = pd.DataFrame(columns=cols)
while end:
    links = driver.find_elements_by_css_selector('#MainCol .flexbox .jobLink')
    # j indexes the current row in the parallel element lists of company names,
    # ratings, and job links looked up below
    for j, link in enumerate(links):
        time.sleep(2)
        link.click()
        # Col 1: Job Title
        job_titles = link.text
        print('Title: ', job_titles)
        try:
            # close the pop-up that tries to block scrapers
            driver.find_element_by_class_name('mfp-close').click()
        except:
            pass
        # Col 2: Company Name (unidecode transliterates accented characters)
        companies = unidecode.unidecode(
            link.find_elements_by_xpath('//div[@class="flexbox empLoc"]/div[1]')[j].text.split("–")[0].strip())
        print('Company: ', companies)
        # Known issue: listings tagged HOT or NEW won't be read as a posted date
        # print('Posted: ', link.find_elements_by_xpath('//span[@class="minor"]')[j].text)
        # Col 3: Link to the job
        job_links = link.find_elements_by_xpath('//div[@class="flexbox"]/div/a')[j].get_attribute('href')
        time.sleep(5)
        # Col 4: Ratings
        try:
            ratings = link.find_element_by_xpath('//span[@class="compactStars margRtSm"]').text
            print('Ratings: ', ratings)
        except:
            ratings = ''
            print('Ratings: ', ratings)
        # Tab 1: Job description
        # Col 5: Job description
        try:
            descriptions = unidecode.unidecode(link.find_element_by_xpath(
                '//div[@class="jobDescriptionContent desc module pad noMargBot"]').text)
        except:
            # the description panel may not have rendered yet; wait and retry once
            time.sleep(20)
            descriptions = unidecode.unidecode(link.find_element_by_xpath(
                '//div[@class="jobDescriptionContent desc module pad noMargBot"]').text)
        # Tab 2: Company tab
        # Select a value node at the same level as a label node, based on the label text:
        # https://stackoverflow.com/questions/26963092/selecting-values-in-xpath-depending-on-values-at-same-level
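        # Pattern used below ("X" is a placeholder field name for illustration):
        #   //div[@class = "infoEntity"][label[.] = "X"]/span[@class = "value"]
        # i.e. take the <span class="value"> inside the <div class="infoEntity">
        # whose sibling <label> text equals "X".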
        try:
            driver.find_element_by_xpath('//li[@data-target = "CompanyContainer"]').click()
            # Col 6: Size
            sizes = link.find_element_by_xpath('//div[@class = "infoEntity"][label[.] = "Size"]/'
                                               'span[@class = "value"]').text
            print('Size: ', sizes)
            # Col 7: Founded
            founded_years = link.find_element_by_xpath('//div[@class = "infoEntity"][label[.] = "Founded"]/'
                                                       'span[@class = "value"]').text
            print('Founded: ', founded_years)
            # Col 8: Type
            types = link.find_element_by_xpath('//div[@class = "infoEntity"][label[.] = "Type"]/'
                                               'span[@class = "value"]').text.replace("Company - ", "")
            print('Type: ', types)
            # Col 9: Industry
            industries = link.find_element_by_xpath('//div[@class = "infoEntity"][label[.] = "Industry"]/'
                                                    'span[@class = "value"]').text
            print('Industry: ', industries)
            # Col 10: Revenue
            revenues = link.find_element_by_xpath('//div[@class = "infoEntity"][label[.] = "Revenue"]/'
                                                  'span[@class = "value"]').text
            print('Revenue: ', revenues)
        except:
            # company tab missing or not loaded; keep the row with blank fields
            sizes = ''
            print('Size: ', sizes)
            founded_years = ''
            print('Founded: ', founded_years)
            types = ''
            print('Type: ', types)
            industries = ''
            print('Industry: ', industries)
            revenues = ''
            print('Revenue: ', revenues)
        # Tab 3: Rating tab
        try:
            driver.find_element_by_xpath('//li[@data-target = "RatingContainer"]').click()
            # Col 11: CEO (unidecode strips accents in the CEO name)
            CEOs = unidecode.unidecode(link.find_element_by_xpath(
                '//div[@class = "tbl gfxContainer"]/div[3]/div[@class="tbl"]/div[2]/div[1]').text)
            print('CEO: ', CEOs)
            # Col 12: Recommend
            recommends = link.find_element_by_xpath('//div[@id = "EmpStats_Recommend"]').get_attribute('data-percentage')
            print('Recommend: ', recommends)
            # Col 13: Approve of CEO
            approves = link.find_element_by_xpath('//div[@id = "EmpStats_Approve"]').get_attribute('data-percentage')
            print('Approves: ', approves)
            print('\n')
        except:
            # rating tab missing or not loaded; keep the row with blank fields
            CEOs = ''
            print('CEO: ', CEOs)
            recommends = ''
            print('Recommend: ', recommends)
            approves = ''
            print('Approves: ', approves)
            print('\n')
        df = df.append({
            'Link': job_links,
            'Title': job_titles,
            'Company': companies,
            'Rating': ratings,
            'Job_Description': descriptions,
            'Size': sizes,
            'Founded': founded_years,
            'Company_Type': types,
            'Industry': industries,
            'Revenue': revenues,
            'CEO': CEOs,
            'Recommend': recommends,
            'Approve': approves
        }, ignore_index=True)
    time.sleep(2)
    # To prevent Selenium raising a stale element reference:
    # https://stackoverflow.com/questions/45002008/selenium-stale-element-reference-element-is-not-attached-to-the-page
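    # After clicking "next" the DOM is rebuilt, so any elements found on the old
    # page go stale; the loop therefore re-queries the job links at the top of
    # each iteration. A missing or unclickable "next" button is treated as the
    # last page of results.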
    try:
        driver.find_element_by_css_selector('.next a').click()
    except:
        end = False
        break
print('Data successfully scraped')
df.to_csv('glassdoor_data.csv', index=False)
conn = sqlite3.connect('data.sqlite')
df.to_sql('data', conn, if_exists='replace')
print('Db successfully constructed and saved')
time.sleep(5)
driver.close()
conn.close()
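
# Optional sanity check, a minimal sketch: read the saved table back with pandas
# and print a few rows. Assumes the 'data' table written by to_sql above.
check_conn = sqlite3.connect('data.sqlite')
print(pd.read_sql('SELECT Title, Company, Rating FROM data LIMIT 5', check_conn))
check_conn.close()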