
Commit

Merge pull request #70 from ferru97/feature/release_1_4_0
Feature/release 1 4 0
ferru97 authored Nov 2, 2024
2 parents 5509915 + 9617b19 commit 619365a
Showing 10 changed files with 150 additions and 87 deletions.
46 changes: 26 additions & 20 deletions PyPaperBot/Downloader.py
@@ -4,24 +4,23 @@
from .HTMLparsers import getSchiHubPDF, SciHubUrls
import random
from .NetInfo import NetInfo
from .Utils import URLjoin


def setSciHubUrl():
print("Searching for a sci-hub mirror")
r = requests.get(NetInfo.SciHub_URLs_repo, headers=NetInfo.HEADERS)
links = SciHubUrls(r.text)
found = False

for l in links:
try:
print("Trying with {}...".format(l))
r = requests.get(l, headers=NetInfo.HEADERS)
if r.status_code == 200:
found = True
NetInfo.SciHub_URL = l
break
except:
pass
if found:
print("\nUsing {} as Sci-Hub instance\nYou can use a specific mirror with the --scihub-mirror argument\n".format(NetInfo.SciHub_URL))
else:
print(
"\nNo working Sci-Hub instance found!\nIf Sci-Hub is not available in your country, consider using a VPN or a proxy\nYou can use a specific mirror with the --scihub-mirror argument")
@@ -47,13 +46,17 @@ def saveFile(file_name, content, paper, dwn_source):
paper.downloadedFrom = dwn_source


def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None):
def URLjoin(*args):
return "/".join(map(lambda x: str(x).rstrip('/'), args))
def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None, SciDB_URL=None):

NetInfo.SciHub_URL = SciHub_URL
if NetInfo.SciHub_URL is None:
setSciHubUrl()
if SciDB_URL is not None:
NetInfo.SciDB_URL = SciDB_URL

print("\nUsing Sci-Hub mirror {}".format(NetInfo.SciHub_URL))
print("Using SciDB mirror {}".format(NetInfo.SciDB_URL))
print("You can use --scihub-mirror and --annas-archive-mirror to specify your desired mirror URL\n")

num_downloaded = 0
paper_number = 1
@@ -65,37 +68,40 @@ def URLjoin(*args):

pdf_dir = getSaveDir(dwnl_dir, p.getFileName())

faild = 0
failed = 0
url = ""
while not p.downloaded and faild != 4:
while not p.downloaded and failed != 5:
try:
dwn_source = 1 # 1 scihub 2 scholar
if faild == 0 and p.DOI is not None:
dwn_source = 1 # 1 scidb - 2 scihub - 3 scholar
if failed == 0 and p.DOI is not None:
url = URLjoin(NetInfo.SciDB_URL, p.DOI)
if failed == 1 and p.DOI is not None:
url = URLjoin(NetInfo.SciHub_URL, p.DOI)
if faild == 1 and p.scholar_link is not None:
dwn_source = 2
if failed == 2 and p.scholar_link is not None:
url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
if faild == 2 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf":
if failed == 3 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf":
url = p.scholar_link
dwn_source = 2
if faild == 3 and p.pdf_link is not None:
dwn_source = 3
if failed == 4 and p.pdf_link is not None:
url = p.pdf_link
dwn_source = 2
dwn_source = 3

if url != "":
r = requests.get(url, headers=NetInfo.HEADERS)
content_type = r.headers.get('content-type')

if dwn_source == 1 and 'application/pdf' not in content_type:
time.sleep(random.randint(1, 5))
if (dwn_source == 1 or dwn_source == 2) and 'application/pdf' not in content_type and "application/octet-stream" not in content_type:
time.sleep(random.randint(1, 4))

pdf_link = getSchiHubPDF(r.text)
if pdf_link is not None:
r = requests.get(pdf_link, headers=NetInfo.HEADERS)
content_type = r.headers.get('content-type')

if 'application/pdf' in content_type:
if 'application/pdf' in content_type or "application/octet-stream" in content_type:
paper_files.append(saveFile(pdf_dir, r.content, p, dwn_source))
except Exception:
pass

faild += 1
failed += 1
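
For reference, a minimal sketch (not part of the commit) of the new fallback order used by downloadPapers, assuming a paper object with the same DOI, scholar_link and pdf_link attributes as Paper; candidate_urls is a hypothetical helper name.

from PyPaperBot.Utils import URLjoin

def candidate_urls(paper, scihub_url, scidb_url):
    """Illustrative only: list the URLs downloadPapers will try, in order.
    The real loop advances a 'failed' counter from 0 to 4 and simply leaves
    the url empty for attempts whose input (DOI, scholar_link, pdf_link) is missing."""
    if paper.DOI is not None:
        yield URLjoin(scidb_url, paper.DOI)            # attempt 0: Anna's Archive SciDB by DOI
        yield URLjoin(scihub_url, paper.DOI)           # attempt 1: Sci-Hub by DOI
    if paper.scholar_link is not None:
        yield URLjoin(scihub_url, paper.scholar_link)  # attempt 2: Sci-Hub via Scholar link
        if paper.scholar_link.endswith("pdf"):
            yield paper.scholar_link                   # attempt 3: Scholar link that is itself a PDF
    if paper.pdf_link is not None:
        yield paper.pdf_link                           # attempt 4: direct PDF link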
13 changes: 11 additions & 2 deletions PyPaperBot/HTMLparsers.py
@@ -5,6 +5,7 @@
@author: Vito
"""
from bs4 import BeautifulSoup
import re


def schoolarParser(html):
@@ -72,8 +73,10 @@ def getSchiHubPDF(html):
result = None
soup = BeautifulSoup(html, "html.parser")

iframe = soup.find(id='pdf')
plugin = soup.find(id='plugin')
iframe = soup.find(id='pdf') #scihub logic
plugin = soup.find(id='plugin') #scihub logic
download_scidb = soup.find("a", text=lambda text: text and "Download" in text, href=re.compile(r"\.pdf$")) #scidb logic
embed_scihub = soup.find("embed") #scihub logic

if iframe is not None:
result = iframe.get("src")
@@ -84,6 +87,12 @@
if result is not None and result[0] != "h":
result = "https:" + result

if download_scidb is not None and result is None:
result = download_scidb.get("href")

if embed_scihub is not None and result is None:
result = embed_scihub.get("original-url")

return result


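The new SciDB branch in getSchiHubPDF looks for an anchor whose text contains "Download" and whose href ends in .pdf. A small, self-contained illustration on a made-up page fragment:

from bs4 import BeautifulSoup
import re

# Hypothetical SciDB-style fragment, only to show what the new selector matches.
html = '<a href="/files/10.1000/xyz123.pdf">Download (PDF)</a>'
soup = BeautifulSoup(html, "html.parser")
link = soup.find("a", text=lambda t: t and "Download" in t, href=re.compile(r"\.pdf$"))
print(link.get("href"))  # /files/10.1000/xyz123.pdf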
1 change: 1 addition & 0 deletions PyPaperBot/NetInfo.py
@@ -1,4 +1,5 @@
class NetInfo:
SciHub_URL = None
SciDB_URL = "https://annas-archive.se/scidb/"
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
SciHub_URLs_repo = "https://sci-hub.41610.org/"
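
A quick, hypothetical reachability check of the new default SciDB mirror using the shared request headers (not part of the commit; mirrors what setSciHubUrl does for Sci-Hub mirrors):

import requests
from PyPaperBot.NetInfo import NetInfo

# Probe the default SciDB mirror with the same headers PyPaperBot uses for every request.
r = requests.get(NetInfo.SciDB_URL, headers=NetInfo.HEADERS, timeout=10)
print(NetInfo.SciDB_URL, "->", r.status_code)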
84 changes: 49 additions & 35 deletions PyPaperBot/Paper.py
@@ -6,8 +6,8 @@
"""
import bibtexparser
import re
import csv
import os
import pandas as pd
import urllib.parse


class Paper:
@@ -28,12 +28,17 @@ def __init__(self,title=None, scholar_link=None, scholar_page=None, cites=None,

self.downloaded = False
self.downloadedFrom = 0 # 1-SciDB 2-SciHub 3-Scholar

self.use_doi_as_filename = False # if True, the filename will be the DOI

def getFileName(self):
try:
return re.sub(r'[^\w\-_. ]', '_', self.title) + ".pdf"
except:
return "none.pdf"
try:
if self.use_doi_as_filename:
return urllib.parse.quote(self.DOI, safe='') + ".pdf"
else:
return re.sub(r'[^\w\-_. ]', '_', self.title) + ".pdf"
except:
return "none.pdf"

def setBibtex(self, bibtex):
x = bibtexparser.loads(bibtex, parser=None)
@@ -56,35 +61,44 @@ def canBeDownloaded(self):
return self.DOI is not None or self.scholar_link is not None

def generateReport(papers, path):
with open(path, mode="w", encoding='utf-8', newline='', buffering=1) as w_file:
content = ["Name", "Scholar Link", "DOI", "Bibtex",
"PDF Name", "Year", "Scholar page", "Journal",
"Downloaded", "Downloaded from", "Authors"]
file_writer = csv.DictWriter(w_file, delimiter=",", lineterminator=os.linesep, fieldnames=content)
file_writer.writeheader()

for p in papers:
pdf_name = p.getFileName() if p.downloaded else ""
bibtex_found = p.bibtex is not None

dwn_from = ""
if p.downloadedFrom == 1:
dwn_from = "SciHub"
if p.downloadedFrom == 2:
dwn_from = "Scholar"

file_writer.writerow({
"Name": p.title,
"Scholar Link": p.scholar_link,
"DOI": p.DOI,
"Bibtex": bibtex_found,
"PDF Name": pdf_name,
"Year": p.year,
"Scholar page": p.scholar_page,
"Journal": p.jurnal,
"Downloaded": p.downloaded,
"Downloaded from": dwn_from,
"Authors": p.authors})
# Define the column names
columns = ["Name", "Scholar Link", "DOI", "Bibtex", "PDF Name",
"Year", "Scholar page", "Journal", "Downloaded",
"Downloaded from", "Authors"]

# Prepare data to populate the DataFrame
data = []
for p in papers:
pdf_name = p.getFileName() if p.downloaded else ""
bibtex_found = p.bibtex is not None

# Determine download source
dwn_from = ""
if p.downloadedFrom == 1:
dwn_from = "SciDB"
elif p.downloadedFrom == 2:
dwn_from = "SciHub"
elif p.downloadedFrom == 3:
dwn_from = "Scholar"

# Append row data as a dictionary
data.append({
"Name": p.title,
"Scholar Link": p.scholar_link,
"DOI": p.DOI,
"Bibtex": bibtex_found,
"PDF Name": pdf_name,
"Year": p.year,
"Scholar page": p.scholar_page,
"Journal": p.jurnal,
"Downloaded": p.downloaded,
"Downloaded from": dwn_from,
"Authors": p.authors
})

# Create a DataFrame and write to CSV
df = pd.DataFrame(data, columns=columns)
df.to_csv(path, index=False, encoding='utf-8')

def generateBibtex(papers, path):
content = ""
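An illustration of the two filename strategies getFileName now supports, using a made-up title and DOI; with --use-doi-as-filename the DOI is percent-encoded so its slash cannot break the path:

import re
import urllib.parse

title = "Deep Learning: A Review"   # made-up values for illustration
doi = "10.1000/xyz123"

print(re.sub(r'[^\w\-_. ]', '_', title) + ".pdf")   # default: Deep Learning_ A Review.pdf
print(urllib.parse.quote(doi, safe='') + ".pdf")    # --use-doi-as-filename: 10.1000%2Fxyz123.pdf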
2 changes: 2 additions & 0 deletions PyPaperBot/Utils.py
@@ -0,0 +1,2 @@
def URLjoin(*args):
return "/".join(map(lambda x: str(x).rstrip('/'), args))
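
URLjoin simply strips trailing slashes and rejoins the pieces with "/", so a mirror URL composes cleanly with a DOI whether or not it ends in a slash (made-up mirror and DOI below):

from PyPaperBot.Utils import URLjoin

print(URLjoin("https://annas-archive.se/scidb/", "10.1000/xyz123"))
# https://annas-archive.se/scidb/10.1000/xyz123
print(URLjoin("https://sci-hub.example/", "10.1000/xyz123"))
# https://sci-hub.example/10.1000/xyz123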
2 changes: 1 addition & 1 deletion PyPaperBot/__init__.py
@@ -1 +1 @@
__version__= "1.3.1"
__version__= "1.4.0"
33 changes: 30 additions & 3 deletions PyPaperBot/__main__.py
@@ -4,15 +4,33 @@
import sys
import os
import time
import requests
from .Paper import Paper
from .PapersFilters import filterJurnals, filter_min_date, similarStrings
from .Downloader import downloadPapers
from .Scholar import ScholarPapersInfo
from .Crossref import getPapersInfoFromDOIs
from .proxy import proxy
from .__init__ import __version__
from urllib.parse import urljoin

def checkVersion():
try:
print("PyPaperBot v" + __version__)
response = requests.get('https://pypi.org/pypi/pypaperbot/json')
latest_version = response.json()['info']['version']
if latest_version != __version__:
print("NEW VERSION AVAILABLE!\nUpdate with 'pip install PyPaperBot --upgrade' to get the latest features!\n")
except:
pass


def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None,
filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None):
filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None,
use_doi_as_filename=False, SciDB_URL=None):

if SciDB_URL is not None and "/scidb" not in SciDB_URL:
SciDB_URL = urljoin(SciDB_URL, "/scidb/")

to_download = []
if DOIs is None:
@@ -27,6 +45,7 @@ def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None,
DOI = DOIs[i]
print("Searching paper {} of {} with DOI {}".format(num, len(DOIs), DOI))
papersInfo = getPapersInfoFromDOIs(DOI, restrict)
papersInfo.use_doi_as_filename = use_doi_as_filename
to_download.append(papersInfo)

num += 1
@@ -45,7 +64,7 @@ def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None,
if num_limit_type is not None and num_limit_type == 1:
to_download.sort(key=lambda x: int(x.cites_num) if x.cites_num is not None else 0, reverse=True)

downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL)
downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL, SciDB_URL)

Paper.generateReport(to_download, dwn_dir + "result.csv")
Paper.generateBibtex(to_download, dwn_dir + "bibtex.bib")
@@ -83,6 +102,8 @@ def main():
help='0:Download only Bibtex - 1:Download only papers PDF')
parser.add_argument('--scihub-mirror', default=None, type=str,
help='Mirror for downloading papers from sci-hub. If not set, it is selected automatically')
parser.add_argument('--annas-archive-mirror', default=None, type=str,
help='Mirror for downloading papers from Annas Archive (SciDB). If not set, https://annas-archive.se is used')
parser.add_argument('--scholar-results', default=10, type=int, choices=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
help='Downloads the first x results for each scholar page (default/max=10)')
parser.add_argument('--proxy', nargs='+', default=[],
@@ -91,6 +112,8 @@ def main():
help='Use a single proxy. Recommended if using --proxy gives errors')
parser.add_argument('--selenium-chrome-version', type=int, default=None,
help='First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed.')
parser.add_argument('--use-doi-as-filename', action='store_true', default=False,
help='Use DOIs as output file names')
args = parser.parse_args()

if args.single_proxy is not None:
@@ -123,6 +146,8 @@ def main():
dwn_dir = args.dwn_dir.replace('\\', '/')
if dwn_dir[-1] != '/':
dwn_dir += "/"
if not os.path.exists(dwn_dir):
os.makedirs(dwn_dir, exist_ok=True)

if args.max_dwn_year is not None and args.max_dwn_cites is not None:
print("Error: Only one option between '--max-dwn-year' and '--max-dwn-cites' can be used ")
@@ -174,9 +199,11 @@ def main():


start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type ,
args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites)
args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites,
args.use_doi_as_filename, args.annas_archive_mirror)

if __name__ == "__main__":
checkVersion()
main()
print(
"""\nWork completed!
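How the new --annas-archive-mirror value is normalized in start: if the given mirror does not already contain /scidb, urljoin appends it as an absolute path (example.org used below as a stand-in mirror):

from urllib.parse import urljoin

for mirror in ("https://annas-archive.se", "https://example.org/scidb/"):
    if "/scidb" not in mirror:
        mirror = urljoin(mirror, "/scidb/")
    print(mirror)
# https://annas-archive.se/scidb/
# https://example.org/scidb/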
1 change: 0 additions & 1 deletion PyPaperBot/proxy.py
@@ -1,6 +1,5 @@
import socket
import pyChainedProxy as socks
from .Downloader import downloadPapers

def proxy(pchain):

