forked from YongyuLiu03/SEproject
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
47 lines (27 loc) · 1.02 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import json
import requests
from bs4 import BeautifulSoup, Tag
import undetected_chromedriver as webdriver
# Fetch the NYU class-search page two ways: a plain HTTP request routed through
# the Oxylabs unblocker proxy, and a headless Chrome session that saves a
# screenshot of the same page.

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
# NOTE(review): "--use_subprocess" is forwarded to Chrome as a command-line
# switch; it looks like undetected_chromedriver's `use_subprocess` constructor
# keyword may have been intended instead -- confirm.
chrome_options.add_argument("--use_subprocess")

# SECURITY: credentials should not live in source control. Environment
# variables take precedence; the literals remain only as a backward-compatible
# fallback and should be rotated and removed.
USERNAME = os.environ.get("OXYLABS_USERNAME", "yyklc")
PASSWORD = os.environ.get("OXYLABS_PASSWORD", "030302_LiuYongyu")

PROXY_ENDPOINT = "unblock.oxylabs.io:60000"
proxy = 'http://{}:{}@{}'.format(USERNAME, PASSWORD, PROXY_ENDPOINT)
# Mapping in the shape `requests` expects for its `proxies=` argument.
proxies = {
    'http': proxy,
    'https': proxy,
}

# Chrome's --proxy-server switch takes a single proxy address, not a Python
# dict, so pass the endpoint itself rather than the repr of `proxies`.
# NOTE(review): Chrome does not appear to honour user:password embedded in this
# switch, so the browser session may still need separate proxy authentication
# (e.g. an auth extension) -- confirm against the proxy provider's docs.
chrome_options.add_argument(f"--proxy-server=http://{PROXY_ENDPOINT}")

# Equivalent manual check with curl (credentials taken from the environment):
# curl 'https://sandbox.oxylabs.io/products/' -U "$OXYLABS_USERNAME:$OXYLABS_PASSWORD" -x 'unblock.oxylabs.io:60000' -H 'x-oxylabs-geo-location: United States' -k

page = "https://sis.nyu.edu/psc/csprod/EMPLOYEE/SA/c/NYU_SR.NYU_CLS_SRCH.GBL"

# verify=False mirrors the `-k` flag of the curl command above (certificate
# verification is skipped for the proxied request). The timeout keeps the
# script from hanging forever if the proxy or the site never answers.
response = requests.get(page, proxies=proxies, verify=False, timeout=60)
print(response.status_code)
content = response.content  # raw response bytes, kept for inspection

# To watch the browser while debugging, drop the "--headless" argument above.
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get(page)
    browser.save_screenshot("screenshot.png")
finally:
    # Always shut down Chrome and its driver, even if navigation or the
    # screenshot fails, so no orphaned browser processes are left behind.
    browser.quit()

soup = BeautifulSoup(response.text, "html.parser")
print(soup.body)