scraper.py
"""Scrape LinkedIn job postings for a keyword into a CSV using Selenium and BeautifulSoup."""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
import time
# LinkedIn credentials and search term (replace the placeholders before running)
username = 'email'
password = 'password'
job_keyword = "Data Scientist"
# Define options to make Selenium lightweight
options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-errors")
options.add_argument('--no-sandbox')
options.add_argument("--disable-extensions")
options.add_argument("--disable-blink-features")
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(options=options)
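# Log in to LinkedIn with the credentials defined above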
driver.get('https://www.linkedin.com/login?trk=guest_homepage-basic_nav-header-signin')
user_field = driver.find_element(By.ID, "username")
pw_field = driver.find_element(By.ID, "password")
login_button = driver.find_element(By.XPATH,
                                   '//*[@id="organic-div"]/form/div[3]/button')
user_field.send_keys(username)
user_field.send_keys(Keys.TAB)
time.sleep(2)
pw_field.send_keys(password)
time.sleep(2)
login_button.click()
time.sleep(6)
#minimize=driver.find_element("id","ember150")
#minimize.click()
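# Run the keyword search from the global navigation search bar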
search_button = driver.find_element(By.CLASS_NAME, "search-global-typeahead")
# Use ActionChains to click the search bar, type the job keyword, and submit
actions = ActionChains(driver)
actions.click(search_button).perform()
actions.send_keys(job_keyword).perform()
actions.send_keys(Keys.ENTER).perform()
time.sleep(6)
# Narrow the search results to the "Jobs" tab
jobsicon = driver.find_element(By.CSS_SELECTOR, "button[class*='search-reusables__']")
jobsicon.click()
time.sleep(6)
# Scroll the results pane so lazily loaded job cards render
scrollresults = driver.find_element(By.CLASS_NAME, "jobs-search-results-list")
for i in range(300, 5000, 100):
    driver.execute_script("arguments[0].scrollTo(0, {})".format(i), scrollresults)
    time.sleep(0.2)
# One row per job card: title, location, applicants, poster, requirements, URL
jobs_df = pd.DataFrame(columns=['Job Title', 'Location', 'Applicant Count', 'Job Poster', 'Requirements', 'Job URL'])
page_num = 1  # current results page; the first "next" click targets page 2
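# Iterate over result pages, scraping every job card on each page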
while True:
    links = driver.find_elements(By.XPATH, '//div[@data-job-id]')
    for i in range(len(links)):
        try:
            link = links[i]
            link.click()
            time.sleep(1)
            # If the click navigated away from the results, go back and re-fetch the cards
            if f"{job_keyword}" not in driver.title:
                driver.back()
                time.sleep(4)
                links = driver.find_elements(By.XPATH, '//div[@data-job-id]')
            link = links[i]
            html_source = link.get_attribute('innerHTML')
            # create a BeautifulSoup object from the HTML source
            soup = BeautifulSoup(html_source, 'html.parser')
            # extract the job title, company name, and location from the soup object
            try:
                job_title = soup.find('a', {'class': 'disabled ember-view job-card-container__link job-card-list__title'}).text.strip()
            except AttributeError:
                job_title = None
            try:
                element = driver.find_element(By.CSS_SELECTOR, f'div[aria-label="{job_title}"][data-job-details-events-trigger=""]')
                soup = BeautifulSoup(element.get_attribute('outerHTML'), 'html.parser')
                location = soup.find('span', {'class': 'jobs-unified-top-card__bullet'}).text.strip()
            except (NoSuchElementException, AttributeError):
                location = None
            try:
                applicant_count = int(soup.find('span', {'class': 'jobs-unified-top-card__applicant-count'}).text.strip().split()[0])
            except (AttributeError, ValueError):
                applicant_count = None
            try:
                job_poster = soup.find('span', {'class': 'jobs-poster__name'}).strong.text.strip()
            except AttributeError:
                job_poster = None
            try:
                # keep only non-empty <li> entries from the job description
                requirement_tags = soup.find('div', {'id': 'job-details'}).find_all('li')
                requirements = [tag.text.strip() for tag in requirement_tags if tag.text.strip()]
            except AttributeError:
                requirements = None
            try:
                job_url = "https://www.linkedin.com/" + soup.find('a')['href']
            except (AttributeError, TypeError):
                job_url = None
            jobs_df.loc[jobs_df.shape[0]] = {
                'Job Title': job_title,
                'Location': location,
                'Applicant Count': applicant_count,
                'Job Poster': job_poster,
                'Requirements': requirements,
                'Job URL': job_url
            }
            time.sleep(1)
        except Exception as exc:
            print(f"Skipping job card {i}: {exc}")
    # save after every page so progress survives a crash
    jobs_df.to_csv("jobs.csv")
    try:
        # find the button for the next results page via its aria-label
        next_button = driver.find_element(By.XPATH, f"//button[@aria-label='Page {page_num + 1}']")
        next_button.click()
        time.sleep(2)  # wait for page to load
        page_num += 1  # update page number
        scrollresults = driver.find_element(By.CLASS_NAME, "jobs-search-results-list")
        for i in range(300, 5000, 100):
            driver.execute_script("arguments[0].scrollTo(0, {})".format(i), scrollresults)
            time.sleep(0.2)
        print(page_num)
    except NoSuchElementException:
        # stop the loop when no further page button is available
        break
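# Close the browser once pagination is exhausted
driver.quit()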