download.py
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
from selenium import webdriver
from fake_useragent import UserAgent
import sys
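# Batch-download the PDFs listed in ./1.csv by driving a local Chrome instance.
# The element IDs used below ('download-pdf-popover', 'doi-link') suggest ScienceDirect-style
# article pages, but any page with the same DOM structure would work.
# Note: find_element_by_* and executable_path belong to the Selenium 3 API;
# Selenium 4 replaced them with find_element(By..., ...) and Service(...).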
'''
#Get the available address of scihub
from fake_useragent import UserAgent
from selenium import webdriver
import os
headers = {'User-Agent': UserAgent().random}
url = 'https://lovescihub.wordpress.com/'
abspath = os.path.abspath(r"C:\Program Files\Google\Chrome\Application\chromedriver.exe")
browser = webdriver.Chrome(executable_path=abspath)
browser.get(url)
findcurrentscihub = browser.find_element_by_xpath('//*[@id="post-22"]/div/p[2]/a[1]')
findcurrentscihub = browser.find_element_by_xpath('//*[@id="post-22"]/div/p[2]/a[2]')
findcurrentscihub = browser.find_element_by_xpath('//*[@id="post-22"]/div/p[2]/a[3]')
findcurrentscihub = browser.find_element_by_xpath('//*[@id="post-22"]/div/p[2]/a[4]')
scihub = findcurrentscihub.get_attribute('href')
browser.quit()'''
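# 1.csv is expected to provide one row per paper, with at least a 'title' and a 'url' column.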
df = pd.read_csv('./1.csv')
n = len(df)
abspath = os.path.abspath(r"C:\Program Files\Google\Chrome\Application\chromedriver.exe")
for i in range(0, n):
    headers = {'User-Agent': UserAgent().random}
    title = df.loc[i, 'title']
    #if sys.getsizeof(title) < 255:
    # Strip or substitute characters that are illegal in Windows file names
    # (':' is swapped for a full-width '：' so the title stays readable).
    title = title.replace('?', '').replace(':', '：').replace('/', '-').replace(r'a\c', 'ac').replace(r'S\~', 'S')
    file = title + '.pdf'
    if os.path.isfile(file):
        print('File {} already downloaded'.format(i))
    elif len(os.getcwd() + '/' + file) > 250:
        print('File name is too long.\n' + title)
    else:
        browser = webdriver.Chrome(executable_path=abspath)
        url = df.loc[i, 'url']
        print(title)
        browser.get(url)
        time.sleep(10)
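        # Primary route: click the PDF download popover (id='download-pdf-popover'),
        # follow its first link, and if an interstitial page comes back, follow the
        # link inside the element with id='redirect-message' to fetch the PDF bytes.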
        try:
            searchelement = browser.find_element_by_id('download-pdf-popover')
            searchelement.click()
            downloadoption = browser.find_element_by_xpath('//*[@id="popover-content-download-pdf-popover"]/div/div/a[1]')
            href = downloadoption.get_attribute('href')
            print('popover: ' + href)
            pdf = requests.get(href, headers=headers, timeout=30)
            redirect = BeautifulSoup(pdf.content, 'html.parser')
            for redirect_message in tqdm(redirect.find_all(id="redirect-message")):
                click_url = redirect_message.find('a').get('href')
                click = requests.get(click_url, headers=headers, timeout=30)
                with open(title + '.pdf', 'wb') as output:
                    output.write(click.content)
        except:
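            # Fallback 1: some article pages expose a direct PDF link in the
            # screen-reader view instead of the popover; try that link with the
            # same redirect handling as above.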
            try:
                searchelement = browser.find_element_by_xpath('//*[@id="screen-reader-main-content"]/div/div[2]/a')
                href = searchelement.get_attribute('href')
                print('directLink: ' + href)
                pdf = requests.get(href, headers=headers, timeout=30)
                redirect = BeautifulSoup(pdf.content, 'html.parser')
                for redirect_message in tqdm(redirect.find_all(id="redirect-message")):
                    click_url = redirect_message.find('a').get('href')
                    click = requests.get(click_url, headers=headers, timeout=30)
                    with open(title + '.pdf', 'wb') as output:
                        output.write(click.content)
            except:
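                # Fallback 2: no PDF link is reachable. Record the title and DOI link
                # in TBDFS.txt for manual follow-up; the commented-out block below would
                # instead submit the DOI to Sci-Hub and grab the embedded PDF's src.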
                downloadoption = browser.find_element_by_xpath('//*[@id="doi-link"]/a[1]')
                doi = downloadoption.get_attribute('href')
                '''
                #print('No access, contact the library or try scihub.')
                browser.get('https://sci-hub.ee/')
                browser.find_element_by_xpath('//*[@id="input"]/form/input[2]').send_keys(doi)
                browser.find_element_by_xpath('//*[@id="open"]').click()
                time.sleep(30)
                href = browser.find_element_by_xpath('//*[@id="pdf"]').get_attribute('src')
                href = href.replace('#view=FitH', '')
                '''
                # Append so that one failure does not overwrite the titles recorded earlier.
                with open('TBDFS.txt', 'a') as log:
                    log.write(title + ' ' + doi + '\n')
        time.sleep(10)
        browser.quit()