diff --git a/PyPaperBot/Downloader.py b/PyPaperBot/Downloader.py
index 00f6089..8bf2b66 100644
--- a/PyPaperBot/Downloader.py
+++ b/PyPaperBot/Downloader.py
@@ -4,24 +4,23 @@
 from .HTMLparsers import getSchiHubPDF, SciHubUrls
 import random
 from .NetInfo import NetInfo
+from .Utils import URLjoin
 
 
 def setSciHubUrl():
+    print("Searching for a sci-hub mirror")
     r = requests.get(NetInfo.SciHub_URLs_repo, headers=NetInfo.HEADERS)
     links = SciHubUrls(r.text)
 
-    found = False
     for l in links:
         try:
+            print("Trying with {}...".format(l))
             r = requests.get(l, headers=NetInfo.HEADERS)
             if r.status_code == 200:
-                found = True
                 NetInfo.SciHub_URL = l
                 break
         except:
            pass
 
-    if found:
-        print("\nUsing {} as Sci-Hub instance\nYou can use a specific mirror mirror with the --scihub-mirror argument\n".format(NetInfo.SciHub_URL))
     else:
         print(
             "\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy\nYou can use a specific mirror with the --scihub-mirror argument")
@@ -47,13 +46,17 @@ def saveFile(file_name, content, paper, dwn_source):
     paper.downloadedFrom = dwn_source
 
 
-def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None):
-    def URLjoin(*args):
-        return "/".join(map(lambda x: str(x).rstrip('/'), args))
+def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None, SciDB_URL=None):
 
     NetInfo.SciHub_URL = SciHub_URL
     if NetInfo.SciHub_URL is None:
         setSciHubUrl()
+    if SciDB_URL is not None:
+        NetInfo.SciDB_URL = SciDB_URL
+
+    print("\nUsing Sci-Hub mirror {}".format(NetInfo.SciHub_URL))
+    print("Using Sci-DB mirror {}".format(NetInfo.SciDB_URL))
+    print("You can use --scihub-mirror and --annas-archive-mirror to specify your desired mirror URL\n")
 
     num_downloaded = 0
     paper_number = 1
@@ -65,37 +68,40 @@ def URLjoin(*args):
 
         pdf_dir = getSaveDir(dwnl_dir, p.getFileName())
 
-        faild = 0
+        failed = 0
         url = ""
-        while not p.downloaded and faild != 4:
+        while not p.downloaded and failed != 5:
             try:
-                dwn_source = 1  # 1 scihub 2 scholar
+                dwn_source = 1  # 1 scidb - 2 scihub - 3 scholar
-                if faild == 0 and p.DOI is not None:
+                if failed == 0 and p.DOI is not None:
+                    url = URLjoin(NetInfo.SciDB_URL, p.DOI)
+                if failed == 1 and p.DOI is not None:
                     url = URLjoin(NetInfo.SciHub_URL, p.DOI)
-                if faild == 1 and p.scholar_link is not None:
+                    dwn_source = 2
+                if failed == 2 and p.scholar_link is not None:
                     url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
-                if faild == 2 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf":
+                    dwn_source = 2
+                if failed == 3 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf":
                     url = p.scholar_link
-                    dwn_source = 2
-                if faild == 3 and p.pdf_link is not None:
+                    dwn_source = 3
+                if failed == 4 and p.pdf_link is not None:
                     url = p.pdf_link
-                    dwn_source = 2
+                    dwn_source = 3
 
                 if url != "":
                     r = requests.get(url, headers=NetInfo.HEADERS)
                     content_type = r.headers.get('content-type')
 
-                    if dwn_source == 1 and 'application/pdf' not in content_type:
-                        time.sleep(random.randint(1, 5))
+                    if (dwn_source == 1 or dwn_source == 2) and 'application/pdf' not in content_type and "application/octet-stream" not in content_type:
+                        time.sleep(random.randint(1, 4))
                         pdf_link = getSchiHubPDF(r.text)
                         if pdf_link is not None:
                             r = requests.get(pdf_link, headers=NetInfo.HEADERS)
                             content_type = r.headers.get('content-type')
 
-                    if 'application/pdf' in content_type:
+                    if 'application/pdf' in content_type or "application/octet-stream" in content_type:
                         paper_files.append(saveFile(pdf_dir, r.content, p, dwn_source))
             except Exception:
                 pass
 
-            faild += 1
+            failed += 1
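Note on the new fallback chain: the loop above now tries five sources in order (Sci-DB by DOI, Sci-Hub by DOI, Sci-Hub via the Scholar link, a direct Scholar PDF link, and finally `pdf_link`), bumping `failed` after each miss. The `URLjoin` helper it calls is the one moved to `PyPaperBot/Utils.py` in this diff; a minimal sketch of how it builds the Sci-DB request URL, with illustrative mirror and DOI values:

```python
def URLjoin(*args):
    # As in PyPaperBot/Utils.py: strip trailing slashes, rejoin with "/"
    return "/".join(map(lambda x: str(x).rstrip('/'), args))

# Illustrative values only
print(URLjoin("https://annas-archive.se/scidb/", "10.1038/nphys1170"))
# -> https://annas-archive.se/scidb/10.1038/nphys1170
```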
diff --git a/PyPaperBot/HTMLparsers.py b/PyPaperBot/HTMLparsers.py
index 07e01e7..2632b71 100644
--- a/PyPaperBot/HTMLparsers.py
+++ b/PyPaperBot/HTMLparsers.py
@@ -5,6 +5,7 @@
 @author: Vito
 """
 from bs4 import BeautifulSoup
+import re
 
 
 def schoolarParser(html):
@@ -72,8 +73,10 @@ def getSchiHubPDF(html):
     result = None
     soup = BeautifulSoup(html, "html.parser")
 
-    iframe = soup.find(id='pdf')
-    plugin = soup.find(id='plugin')
+    iframe = soup.find(id='pdf')  # scihub logic
+    plugin = soup.find(id='plugin')  # scihub logic
+    download_scidb = soup.find("a", text=lambda text: text and "Download" in text, href=re.compile(r"\.pdf$"))  # scidb logic
+    embed_scihub = soup.find("embed")  # scihub logic
 
     if iframe is not None:
         result = iframe.get("src")
@@ -84,6 +87,12 @@
     if result is not None and result[0] != "h":
         result = "https:" + result
 
+    if download_scidb is not None and result is None:
+        result = download_scidb.get("href")
+
+    if embed_scihub is not None and result is None:
+        result = embed_scihub.get("original-url")
+
     return result
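The new `download_scidb` selector only matches anchors whose text contains "Download" and whose `href` ends in `.pdf`, so ordinary navigation links are ignored. A self-contained sketch against hypothetical SciDB-style markup (the HTML below is an assumption for illustration, not captured from a live page):

```python
import re
from bs4 import BeautifulSoup

# Hypothetical markup shaped like a SciDB result page
html = '<a href="/files/sample-paper.pdf">Download</a><a href="/about">About</a>'
soup = BeautifulSoup(html, "html.parser")

link = soup.find("a", text=lambda t: t and "Download" in t, href=re.compile(r"\.pdf$"))
print(link.get("href") if link is not None else None)
# -> /files/sample-paper.pdf
```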
diff --git a/PyPaperBot/NetInfo.py b/PyPaperBot/NetInfo.py
index 52bcbf1..a70f584 100644
--- a/PyPaperBot/NetInfo.py
+++ b/PyPaperBot/NetInfo.py
@@ -1,4 +1,5 @@
 class NetInfo:
     SciHub_URL = None
+    SciDB_URL = "https://annas-archive.se/scidb/"
     HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
     SciHub_URLs_repo = "https://sci-hub.41610.org/"
\ No newline at end of file
diff --git a/PyPaperBot/Paper.py b/PyPaperBot/Paper.py
index 67ff1f7..2f3b385 100644
--- a/PyPaperBot/Paper.py
+++ b/PyPaperBot/Paper.py
@@ -6,8 +6,8 @@
 """
 import bibtexparser
 import re
-import csv
-import os
+import pandas as pd
+import urllib.parse
 
 
 class Paper:
@@ -28,12 +28,17 @@ def __init__(self,title=None, scholar_link=None, scholar_page=None, cites=None,
 
         self.downloaded = False
-        self.downloadedFrom = 0  # 1-SciHub 2-scholar
+        self.downloadedFrom = 0  # 1-SciDB 2-SciHub 3-Scholar
+
+        self.use_doi_as_filename = False  # if True, the filename will be the DOI
 
     def getFileName(self):
-        try:
-            return re.sub(r'[^\w\-_. ]', '_', self.title) + ".pdf"
-        except:
-            return "none.pdf"
+        try:
+            if self.use_doi_as_filename:
+                return urllib.parse.quote(self.DOI, safe='') + ".pdf"
+            else:
+                return re.sub(r'[^\w\-_. ]', '_', self.title) + ".pdf"
+        except:
+            return "none.pdf"
 
     def setBibtex(self, bibtex):
         x = bibtexparser.loads(bibtex, parser=None)
@@ -56,35 +61,44 @@ def canBeDownloaded(self):
         return self.DOI is not None or self.scholar_link is not None
 
     def generateReport(papers, path):
-        with open(path, mode="w", encoding='utf-8', newline='', buffering=1) as w_file:
-            content = ["Name", "Scholar Link", "DOI", "Bibtex",
-                       "PDF Name", "Year", "Scholar page", "Journal",
-                       "Downloaded", "Downloaded from", "Authors"]
-            file_writer = csv.DictWriter(w_file, delimiter=",", lineterminator=os.linesep, fieldnames=content)
-            file_writer.writeheader()
-
-            for p in papers:
-                pdf_name = p.getFileName() if p.downloaded else ""
-                bibtex_found = p.bibtex is not None
-
-                dwn_from = ""
-                if p.downloadedFrom == 1:
-                    dwn_from = "SciHub"
-                if p.downloadedFrom == 2:
-                    dwn_from = "Scholar"
-
-                file_writer.writerow({
-                    "Name": p.title,
-                    "Scholar Link": p.scholar_link,
-                    "DOI": p.DOI,
-                    "Bibtex": bibtex_found,
-                    "PDF Name": pdf_name,
-                    "Year": p.year,
-                    "Scholar page": p.scholar_page,
-                    "Journal": p.jurnal,
-                    "Downloaded": p.downloaded,
-                    "Downloaded from": dwn_from,
-                    "Authors": p.authors})
+        # Define the column names
+        columns = ["Name", "Scholar Link", "DOI", "Bibtex", "PDF Name",
+                   "Year", "Scholar page", "Journal", "Downloaded",
+                   "Downloaded from", "Authors"]
+
+        # Prepare data to populate the DataFrame
+        data = []
+        for p in papers:
+            pdf_name = p.getFileName() if p.downloaded else ""
+            bibtex_found = p.bibtex is not None
+
+            # Determine download source
+            dwn_from = ""
+            if p.downloadedFrom == 1:
+                dwn_from = "SciDB"
+            elif p.downloadedFrom == 2:
+                dwn_from = "SciHub"
+            elif p.downloadedFrom == 3:
+                dwn_from = "Scholar"
+
+            # Append row data as a dictionary
+            data.append({
+                "Name": p.title,
+                "Scholar Link": p.scholar_link,
+                "DOI": p.DOI,
+                "Bibtex": bibtex_found,
+                "PDF Name": pdf_name,
+                "Year": p.year,
+                "Scholar page": p.scholar_page,
+                "Journal": p.jurnal,
+                "Downloaded": p.downloaded,
+                "Downloaded from": dwn_from,
+                "Authors": p.authors
+            })
+
+        # Create a DataFrame and write to CSV
+        df = pd.DataFrame(data, columns=columns)
+        df.to_csv(path, index=False, encoding='utf-8')
 
     def generateBibtex(papers, path):
         content = ""
diff --git a/PyPaperBot/Utils.py b/PyPaperBot/Utils.py
new file mode 100644
index 0000000..91fb952
--- /dev/null
+++ b/PyPaperBot/Utils.py
@@ -0,0 +1,2 @@
+def URLjoin(*args):
+    return "/".join(map(lambda x: str(x).rstrip('/'), args))
\ No newline at end of file
diff --git a/PyPaperBot/__init__.py b/PyPaperBot/__init__.py
index 7f0253c..bfb895b 100644
--- a/PyPaperBot/__init__.py
+++ b/PyPaperBot/__init__.py
@@ -1 +1 @@
-__version__= "1.3.1"
+__version__= "1.4.0"
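A note on the new `--use-doi-as-filename` behavior in `Paper.getFileName` above: DOIs contain `/`, which is a path separator on every platform, so the whole DOI is percent-encoded with `safe=''` before `.pdf` is appended. A quick illustration with a made-up DOI:

```python
import urllib.parse

doi = "10.1038/nphys1170"  # illustrative DOI
print(urllib.parse.quote(doi, safe='') + ".pdf")
# -> 10.1038%2Fnphys1170.pdf
```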
diff --git a/PyPaperBot/__main__.py b/PyPaperBot/__main__.py
index abdf143..02c5e2d 100644
--- a/PyPaperBot/__main__.py
+++ b/PyPaperBot/__main__.py
@@ -4,15 +4,33 @@
 import sys
 import os
 import time
+import requests
 from .Paper import Paper
 from .PapersFilters import filterJurnals, filter_min_date, similarStrings
 from .Downloader import downloadPapers
 from .Scholar import ScholarPapersInfo
 from .Crossref import getPapersInfoFromDOIs
 from .proxy import proxy
+from .__init__ import __version__
+from urllib.parse import urljoin
+
+
+def checkVersion():
+    try:
+        print("PyPaperBot v" + __version__)
+        response = requests.get('https://pypi.org/pypi/pypaperbot/json')
+        latest_version = response.json()['info']['version']
+        if latest_version != __version__:
+            print("NEW VERSION AVAILABLE!\nUpdate with 'pip install PyPaperBot --upgrade' to get the latest features!\n")
+    except:
+        pass
+
 
 def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None,
-          filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None):
+          filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None,
+          use_doi_as_filename=False, SciDB_URL=None):
+
+    if SciDB_URL is not None and "/scidb" not in SciDB_URL:
+        SciDB_URL = urljoin(SciDB_URL, "/scidb/")
 
     to_download = []
     if DOIs is None:
@@ -27,6 +45,7 @@
             DOI = DOIs[i]
             print("Searching paper {} of {} with DOI {}".format(num, len(DOIs), DOI))
             papersInfo = getPapersInfoFromDOIs(DOI, restrict)
+            papersInfo.use_doi_as_filename = use_doi_as_filename
             to_download.append(papersInfo)
             num += 1
 
@@ -45,7 +64,7 @@
         if num_limit_type is not None and num_limit_type == 1:
             to_download.sort(key=lambda x: int(x.cites_num) if x.cites_num is not None else 0, reverse=True)
 
-        downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL)
+        downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL, SciDB_URL)
 
         Paper.generateReport(to_download, dwn_dir + "result.csv")
         Paper.generateBibtex(to_download, dwn_dir + "bibtex.bib")
@@ -83,6 +102,8 @@
                         help='0:Download only Bibtex - 1:Download only papers PDF')
     parser.add_argument('--scihub-mirror', default=None, type=str,
                         help='Mirror for downloading papers from sci-hub. If not set, it is selected automatically')
+    parser.add_argument('--annas-archive-mirror', default=None, type=str,
+                        help="Mirror for downloading papers from Anna's Archive (SciDB). If not set, https://annas-archive.se is used")
     parser.add_argument('--scholar-results', default=10, type=int, choices=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         help='Downloads the first x results for each scholar page (default/max=10)')
     parser.add_argument('--proxy', nargs='+', default=[],
@@ -91,6 +112,8 @@
                         help='Use a single proxy. Recommended if using --proxy gives errors')
     parser.add_argument('--selenium-chrome-version', type=int, default=None,
                         help='First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed.')
+    parser.add_argument('--use-doi-as-filename', action='store_true', default=False,
+                        help='Use DOIs as output file names')
     args = parser.parse_args()
 
     if args.single_proxy is not None:
@@ -123,6 +146,8 @@
     dwn_dir = args.dwn_dir.replace('\\', '/')
     if dwn_dir[-1] != '/':
         dwn_dir += "/"
+    if not os.path.exists(dwn_dir):
+        os.makedirs(dwn_dir, exist_ok=True)
 
     if args.max_dwn_year is not None and args.max_dwn_cites is not None:
         print("Error: Only one option between '--max-dwn-year' and '--max-dwn-cites' can be used")
@@ -174,9 +199,11 @@
     start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year,
           max_dwn, max_dwn_type,
-          args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites)
+          args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites,
+          args.use_doi_as_filename, args.annas_archive_mirror)
 
 
 if __name__ == "__main__":
+    checkVersion()
     main()
     print(
         """\nWork completed!
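The `/scidb` guard at the top of `start()` relies on a `urljoin` subtlety: a second argument that begins with `/` replaces any path already present on the base URL rather than appending to it. A short demonstration (mirror values are illustrative):

```python
from urllib.parse import urljoin

print(urljoin("https://annas-archive.se", "/scidb/"))
# -> https://annas-archive.se/scidb/

# An existing path is replaced, not extended
print(urljoin("https://annas-archive.se/some/path", "/scidb/"))
# -> https://annas-archive.se/scidb/
```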
diff --git a/PyPaperBot/proxy.py b/PyPaperBot/proxy.py
index 58ba1ea..b326a56 100644
--- a/PyPaperBot/proxy.py
+++ b/PyPaperBot/proxy.py
@@ -1,6 +1,5 @@
 import socket
 import pyChainedProxy as socks
-from .Downloader import downloadPapers
 
 
 def proxy(pchain):
diff --git a/README.md b/README.md
index 08bdabe..a8ce241 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 # PyPaperBot
 
-PyPaperBot is a Python tool for **downloading scientific papers** using Google Scholar, Crossref, and SciHub.
+PyPaperBot is a Python tool for **downloading scientific papers and bibtex** using Google Scholar, Crossref, SciHub, and SciDB.
 The tool tries to download papers from different sources such as PDF provided by Scholar, Scholar related links, and Scihub.
 PyPaperbot is also able to download the **bibtex** of each paper.
 
@@ -52,25 +52,27 @@ pip install PyPaperbot
 
 PyPaperBot arguments:
 
-| Arguments | Description | Type |
-|-----------------------------| ---------------------------------------------------------------------------------------- |--------|
-| \-\-query | Query to make on Google Scholar or Google Scholar page link | string |
-| \-\-cites | Paper ID (from scholar address bar when you search cites) if you want get only citations of that paper | string | string |
-| \-\-doi | DOI of the paper to download (this option uses only SciHub to download) | string |
-| \-\-doi-file | File .txt containing the list of paper's DOIs to download | string |
-| \-\-scholar-pages | Number or range of Google Scholar pages to inspect. Each page has a maximum of 10 papers | string |
-| \-\-dwn-dir | Directory path in which to save the result | string |
-| \-\-min-year | Minimal publication year of the paper to download | int |
-| \-\-max-dwn-year | Maximum number of papers to download sorted by year | int |
-| \-\-max-dwn-cites | Maximum number of papers to download sorted by number of citations | int |
-| \-\-journal-filter | CSV file path of the journal filter (More info on github) | string |
-| \-\-restrict | 0:Download only Bibtex - 1:Down load only papers PDF | int |
-| \-\-scihub-mirror | Mirror for downloading papers from sci-hub. If not set, it is selected automatically | string |
-| \-\-scholar-results | Number of scholar results to bedownloaded when \-\-scholar-pages=1 | int |
-| \-\-proxy | Proxies to be used. Please specify the protocol to be used. | string |
-| \-\-single-proxy | Use a single proxy. Recommended if using --proxy gives errors. | string |
-| \-\-selenium-chrome-version | First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed. | int |
-| \-h | Shows the help | -- |
+| Arguments                   | Description                                                                                                    | Type   |
+|-----------------------------|----------------------------------------------------------------------------------------------------------------|--------|
+| \-\-query                   | Query to make on Google Scholar or Google Scholar page link                                                    | string |
+| \-\-cites                   | Paper ID (from the scholar address bar when you search cites) if you want to get only citations of that paper | string |
+| \-\-doi                     | DOI of the paper to download (this option uses only SciHub to download)                                       | string |
+| \-\-doi-file                | File .txt containing the list of paper's DOIs to download                                                     | string |
+| \-\-scholar-pages           | Number or range of Google Scholar pages to inspect. Each page has a maximum of 10 papers                      | string |
+| \-\-dwn-dir                 | Directory path in which to save the result                                                                    | string |
+| \-\-min-year                | Minimal publication year of the paper to download                                                             | int    |
+| \-\-max-dwn-year            | Maximum number of papers to download sorted by year                                                           | int    |
+| \-\-max-dwn-cites           | Maximum number of papers to download sorted by number of citations                                            | int    |
+| \-\-journal-filter          | CSV file path of the journal filter (More info on github)                                                     | string |
+| \-\-restrict                | 0:Download only Bibtex - 1:Download only papers PDF                                                           | int    |
+| \-\-scihub-mirror           | Mirror for downloading papers from sci-hub. If not set, it is selected automatically                          | string |
+| \-\-annas-archive-mirror    | Mirror for downloading papers from Anna's Archive (SciDB). If not set, https://annas-archive.se is used       | string |
+| \-\-scholar-results         | Number of scholar results to be downloaded when \-\-scholar-pages=1                                           | int    |
+| \-\-proxy                   | Proxies to be used. Please specify the protocol to be used.                                                   | string |
+| \-\-single-proxy            | Use a single proxy. Recommended if using --proxy gives errors.                                                | string |
+| \-\-selenium-chrome-version | First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed. | int |
+| \-\-use-doi-as-filename     | If provided, files are saved using the unique DOI as the filename rather than the default paper title         | bool   |
+| \-h                         | Shows the help                                                                                                 | --     |
 
 ### Note
 
@@ -111,7 +113,7 @@
 Download a paper given the DOI:
 
 ```bash
-python -m PyPaperBot --doi="10.0086/s41037-711-0132-1" --dwn-dir="C:\User\example\papers"`
+python -m PyPaperBot --doi="10.0086/s41037-711-0132-1" --dwn-dir="C:\User\example\papers" --use-doi-as-filename
 ```
 
 Download papers given a file containing the DOIs:
@@ -132,10 +134,13 @@
 Search papers that cite another (find ID in scholar address bar when you search cites):
 
 ```
 python -m PyPaperBot --cites=3120460092236365926
 ```
 
-Using a proxy
+Using proxies
 ```
-python -m PyPaperBot --query=rheumatoid+arthritis --scholar-pages=1 --scholar-results=7 --dwn-dir=/download --proxy http://1.1.1.1::8080 https://8.8.8.8::8080
+python -m PyPaperBot --query=rheumatoid+arthritis --scholar-pages=1 --scholar-results=7 --dwn-dir=/download --proxy="http://1.1.1.1::8080,https://8.8.8.8::8080"
+```
+```
+python -m PyPaperBot --query=rheumatoid+arthritis --scholar-pages=1 --scholar-results=7 --dwn-dir=/download --single-proxy=http://1.1.1.1::8080
 ```
 
 In termux, you can directly use ```PyPaperBot``` followed by arguments...
diff --git a/setup.py b/setup.py
index 7f6e1a7..68e9b19 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setuptools.setup(
     name = 'PyPaperBot',
     packages = setuptools.find_packages(),
-    version = '1.3.1',
+    version = '1.4.0',
     license='MIT',
     description = 'PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref, and SciHub.',
     long_description=long_description,
@@ -14,7 +14,7 @@
     author = 'Vito Ferrulli',
     author_email = 'vitof970@gmail.com',
     url = 'https://github.com/ferru97/PyPaperBot',
-    download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.3.1.tar.gz',
+    download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.4.0.tar.gz',
     keywords = ['download-papers','google-scholar', 'scihub', 'scholar', 'crossref', 'papers'],
     install_requires=[
         'astroid>=2.4.2,<=2.5',
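The version bump to 1.4.0 here is what the new `checkVersion()` in `__main__.py` compares against at startup. A standalone sketch of that lookup (same PyPI endpoint as used in the diff; it requires network access, and the printed value depends on the latest release at run time):

```python
import requests

# PyPI's JSON API; 'info' -> 'version' holds the latest published release
response = requests.get("https://pypi.org/pypi/pypaperbot/json")
latest_version = response.json()["info"]["version"]
print("Latest release on PyPI:", latest_version)
```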