
Commit

Merge pull request #70 from ferru97/feature/release_1_4_0
Feature/release 1 4 0
ferru97 authored Nov 2, 2024
2 parents 5509915 + 9617b19 commit 619365a
Showing 10 changed files with 150 additions and 87 deletions.
46 changes: 26 additions & 20 deletions PyPaperBot/Downloader.py
@@ -4,24 +4,23 @@
from .HTMLparsers import getSchiHubPDF, SciHubUrls
import random
from .NetInfo import NetInfo
from .Utils import URLjoin


def setSciHubUrl():
print("Searching for a sci-hub mirror")
r = requests.get(NetInfo.SciHub_URLs_repo, headers=NetInfo.HEADERS)
links = SciHubUrls(r.text)
found = False

for l in links:
try:
print("Trying with {}...".format(l))
r = requests.get(l, headers=NetInfo.HEADERS)
if r.status_code == 200:
found = True
NetInfo.SciHub_URL = l
break
except:
pass
if found:
print("\nUsing {} as Sci-Hub instance\nYou can use a specific mirror with the --scihub-mirror argument\n".format(NetInfo.SciHub_URL))
else:
print(
"\nNo working Sci-Hub instance found!\nIf Sci-Hub is not available in your country, consider using a VPN or a proxy\nYou can use a specific mirror with the --scihub-mirror argument")
@@ -47,13 +46,17 @@ def saveFile(file_name, content, paper, dwn_source):
paper.downloadedFrom = dwn_source


def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None):
def URLjoin(*args):
return "/".join(map(lambda x: str(x).rstrip('/'), args))
def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None, SciDB_URL=None):

NetInfo.SciHub_URL = SciHub_URL
if NetInfo.SciHub_URL is None:
setSciHubUrl()
if SciDB_URL is not None:
NetInfo.SciDB_URL = SciDB_URL

print("\nUsing Sci-Hub mirror {}".format(NetInfo.SciHub_URL))
print("Using SciDB mirror {}".format(NetInfo.SciDB_URL))
print("You can use --scihub-mirror and --annas-archive-mirror to specify your desired mirror URL\n")

num_downloaded = 0
paper_number = 1
@@ -65,37 +68,40 @@ def URLjoin(*args):

pdf_dir = getSaveDir(dwnl_dir, p.getFileName())

faild = 0
failed = 0
url = ""
while not p.downloaded and faild != 4:
while not p.downloaded and failed != 5:
try:
dwn_source = 1 # 1 scihub 2 scholar
if faild == 0 and p.DOI is not None:
dwn_source = 1 # 1 scidb - 2 scihub - 3 scholar
if failed == 0 and p.DOI is not None:
url = URLjoin(NetInfo.SciDB_URL, p.DOI)
if failed == 1 and p.DOI is not None:
url = URLjoin(NetInfo.SciHub_URL, p.DOI)
if faild == 1 and p.scholar_link is not None:
dwn_source = 2
if failed == 2 and p.scholar_link is not None:
url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
if faild == 2 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf":
if failed == 3 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf":
url = p.scholar_link
dwn_source = 2
if faild == 3 and p.pdf_link is not None:
dwn_source = 3
if failed == 4 and p.pdf_link is not None:
url = p.pdf_link
dwn_source = 2
dwn_source = 3

if url != "":
r = requests.get(url, headers=NetInfo.HEADERS)
content_type = r.headers.get('content-type')

if dwn_source == 1 and 'application/pdf' not in content_type:
time.sleep(random.randint(1, 5))
if (dwn_source == 1 or dwn_source == 2) and 'application/pdf' not in content_type and "application/octet-stream" not in content_type:
time.sleep(random.randint(1, 4))

pdf_link = getSchiHubPDF(r.text)
if pdf_link is not None:
r = requests.get(pdf_link, headers=NetInfo.HEADERS)
content_type = r.headers.get('content-type')

if 'application/pdf' in content_type:
if 'application/pdf' in content_type or "application/octet-stream" in content_type:
paper_files.append(saveFile(pdf_dir, r.content, p, dwn_source))
except Exception:
pass

faild += 1
failed += 1
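
For reference, a minimal sketch (not part of the commit) of the new fallback order used by downloadPapers, assuming a paper object with the same DOI, scholar_link and pdf_link attributes as Paper; candidate_urls is a hypothetical helper name.

from PyPaperBot.Utils import URLjoin

def candidate_urls(paper, scihub_url, scidb_url):
    """Illustrative only: list the URLs downloadPapers will try, in order.
    The real loop advances a 'failed' counter from 0 to 4 and simply leaves
    the url empty for attempts whose input (DOI, scholar_link, pdf_link) is missing."""
    if paper.DOI is not None:
        yield URLjoin(scidb_url, paper.DOI)            # attempt 0: Anna's Archive SciDB by DOI
        yield URLjoin(scihub_url, paper.DOI)           # attempt 1: Sci-Hub by DOI
    if paper.scholar_link is not None:
        yield URLjoin(scihub_url, paper.scholar_link)  # attempt 2: Sci-Hub via Scholar link
        if paper.scholar_link.endswith("pdf"):
            yield paper.scholar_link                   # attempt 3: Scholar link that is itself a PDF
    if paper.pdf_link is not None:
        yield paper.pdf_link                           # attempt 4: direct PDF link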
13 changes: 11 additions & 2 deletions PyPaperBot/HTMLparsers.py
@@ -5,6 +5,7 @@
@author: Vito
"""
from bs4 import BeautifulSoup
import re


def schoolarParser(html):
@@ -72,8 +73,10 @@ def getSchiHubPDF(html):
result = None
soup = BeautifulSoup(html, "html.parser")

iframe = soup.find(id='pdf')
plugin = soup.find(id='plugin')
iframe = soup.find(id='pdf') #scihub logic
plugin = soup.find(id='plugin') #scihub logic
download_scidb = soup.find("a", text=lambda text: text and "Download" in text, href=re.compile(r"\.pdf$")) #scidb logic
embed_scihub = soup.find("embed") #scihub logic

if iframe is not None:
result = iframe.get("src")
@@ -84,6 +87,12 @@
if result is not None and result[0] != "h":
result = "https:" + result

if download_scidb is not None and result is None:
result = download_scidb.get("href")

if embed_scihub is not None and result is None:
result = embed_scihub.get("original-url")

return result


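The new SciDB branch in getSchiHubPDF looks for an anchor whose text contains "Download" and whose href ends in .pdf. A small, self-contained illustration on a made-up page fragment:

from bs4 import BeautifulSoup
import re

# Hypothetical SciDB-style fragment, only to show what the new selector matches.
html = '<a href="/files/10.1000/xyz123.pdf">Download (PDF)</a>'
soup = BeautifulSoup(html, "html.parser")
link = soup.find("a", text=lambda t: t and "Download" in t, href=re.compile(r"\.pdf$"))
print(link.get("href"))  # /files/10.1000/xyz123.pdf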
1 change: 1 addition & 0 deletions PyPaperBot/NetInfo.py
@@ -1,4 +1,5 @@
class NetInfo:
SciHub_URL = None
SciDB_URL = "https://annas-archive.se/scidb/"
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
SciHub_URLs_repo = "https://sci-hub.41610.org/"
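
A quick, hypothetical reachability check of the new default SciDB mirror using the shared request headers (not part of the commit; mirrors what setSciHubUrl does for Sci-Hub mirrors):

import requests
from PyPaperBot.NetInfo import NetInfo

# Probe the default SciDB mirror with the same headers PyPaperBot uses for every request.
r = requests.get(NetInfo.SciDB_URL, headers=NetInfo.HEADERS, timeout=10)
print(NetInfo.SciDB_URL, "->", r.status_code)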
84 changes: 49 additions & 35 deletions PyPaperBot/Paper.py
@@ -6,8 +6,8 @@
"""
import bibtexparser
import re
import csv
import os
import pandas as pd
import urllib.parse


class Paper:
@@ -28,12 +28,17 @@ def __init__(self,title=None, scholar_link=None, scholar_page=None, cites=None,

self.downloaded = False
self.downloadedFrom = 0 # 1-SciDB 2-SciHub 3-Scholar

self.use_doi_as_filename = False # if True, the filename will be the DOI

def getFileName(self):
try:
return re.sub(r'[^\w\-_. ]', '_', self.title) + ".pdf"
except:
return "none.pdf"
try:
if self.use_doi_as_filename:
return urllib.parse.quote(self.DOI, safe='') + ".pdf"
else:
return re.sub(r'[^\w\-_. ]', '_', self.title) + ".pdf"
except:
return "none.pdf"

def setBibtex(self, bibtex):
x = bibtexparser.loads(bibtex, parser=None)
@@ -56,35 +61,44 @@ def canBeDownloaded(self):
return self.DOI is not None or self.scholar_link is not None

def generateReport(papers, path):
with open(path, mode="w", encoding='utf-8', newline='', buffering=1) as w_file:
content = ["Name", "Scholar Link", "DOI", "Bibtex",
"PDF Name", "Year", "Scholar page", "Journal",
"Downloaded", "Downloaded from", "Authors"]
file_writer = csv.DictWriter(w_file, delimiter=",", lineterminator=os.linesep, fieldnames=content)
file_writer.writeheader()

for p in papers:
pdf_name = p.getFileName() if p.downloaded else ""
bibtex_found = p.bibtex is not None

dwn_from = ""
if p.downloadedFrom == 1:
dwn_from = "SciHub"
if p.downloadedFrom == 2:
dwn_from = "Scholar"

file_writer.writerow({
"Name": p.title,
"Scholar Link": p.scholar_link,
"DOI": p.DOI,
"Bibtex": bibtex_found,
"PDF Name": pdf_name,
"Year": p.year,
"Scholar page": p.scholar_page,
"Journal": p.jurnal,
"Downloaded": p.downloaded,
"Downloaded from": dwn_from,
"Authors": p.authors})
# Define the column names
columns = ["Name", "Scholar Link", "DOI", "Bibtex", "PDF Name",
"Year", "Scholar page", "Journal", "Downloaded",
"Downloaded from", "Authors"]

# Prepare data to populate the DataFrame
data = []
for p in papers:
pdf_name = p.getFileName() if p.downloaded else ""
bibtex_found = p.bibtex is not None

# Determine download source
dwn_from = ""
if p.downloadedFrom == 1:
dwn_from = "SciDB"
elif p.downloadedFrom == 2:
dwn_from = "SciHub"
elif p.downloadedFrom == 3:
dwn_from = "Scholar"

# Append row data as a dictionary
data.append({
"Name": p.title,
"Scholar Link": p.scholar_link,
"DOI": p.DOI,
"Bibtex": bibtex_found,
"PDF Name": pdf_name,
"Year": p.year,
"Scholar page": p.scholar_page,
"Journal": p.jurnal,
"Downloaded": p.downloaded,
"Downloaded from": dwn_from,
"Authors": p.authors
})

# Create a DataFrame and write to CSV
df = pd.DataFrame(data, columns=columns)
df.to_csv(path, index=False, encoding='utf-8')

def generateBibtex(papers, path):
content = ""
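An illustration of the two filename strategies getFileName now supports, using a made-up title and DOI; with --use-doi-as-filename the DOI is percent-encoded so its slash cannot break the path:

import re
import urllib.parse

title = "Deep Learning: A Review"   # made-up values for illustration
doi = "10.1000/xyz123"

print(re.sub(r'[^\w\-_. ]', '_', title) + ".pdf")   # default: Deep Learning_ A Review.pdf
print(urllib.parse.quote(doi, safe='') + ".pdf")    # --use-doi-as-filename: 10.1000%2Fxyz123.pdf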
2 changes: 2 additions & 0 deletions PyPaperBot/Utils.py
@@ -0,0 +1,2 @@
def URLjoin(*args):
return "/".join(map(lambda x: str(x).rstrip('/'), args))
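
URLjoin simply strips trailing slashes and rejoins the pieces with "/", so a mirror URL composes cleanly with a DOI whether or not it ends in a slash (made-up mirror and DOI below):

from PyPaperBot.Utils import URLjoin

print(URLjoin("https://annas-archive.se/scidb/", "10.1000/xyz123"))
# https://annas-archive.se/scidb/10.1000/xyz123
print(URLjoin("https://sci-hub.example/", "10.1000/xyz123"))
# https://sci-hub.example/10.1000/xyz123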
2 changes: 1 addition & 1 deletion PyPaperBot/__init__.py
@@ -1 +1 @@
__version__= "1.3.1"
__version__= "1.4.0"
33 changes: 30 additions & 3 deletions PyPaperBot/__main__.py
@@ -4,15 +4,33 @@
import sys
import os
import time
import requests
from .Paper import Paper
from .PapersFilters import filterJurnals, filter_min_date, similarStrings
from .Downloader import downloadPapers
from .Scholar import ScholarPapersInfo
from .Crossref import getPapersInfoFromDOIs
from .proxy import proxy
from .__init__ import __version__
from urllib.parse import urljoin

def checkVersion():
try:
print("PyPaperBot v" + __version__)
response = requests.get('https://pypi.org/pypi/pypaperbot/json')
latest_version = response.json()['info']['version']
if latest_version != __version__:
print("NEW VERSION AVAILABLE!\nUpdate with 'pip install PyPaperBot --upgrade' to get the latest features!\n")
except:
pass


def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None,
filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None):
filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None,
use_doi_as_filename=False, SciDB_URL=None):

if SciDB_URL is not None and "/scidb" not in SciDB_URL:
SciDB_URL = urljoin(SciDB_URL, "/scidb/")

to_download = []
if DOIs is None:
@@ -27,6 +45,7 @@ def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None,
DOI = DOIs[i]
print("Searching paper {} of {} with DOI {}".format(num, len(DOIs), DOI))
papersInfo = getPapersInfoFromDOIs(DOI, restrict)
papersInfo.use_doi_as_filename = use_doi_as_filename
to_download.append(papersInfo)

num += 1
@@ -45,7 +64,7 @@ def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None,
if num_limit_type is not None and num_limit_type == 1:
to_download.sort(key=lambda x: int(x.cites_num) if x.cites_num is not None else 0, reverse=True)

downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL)
downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL, SciDB_URL)

Paper.generateReport(to_download, dwn_dir + "result.csv")
Paper.generateBibtex(to_download, dwn_dir + "bibtex.bib")
@@ -83,6 +102,8 @@ def main():
help='0:Download only Bibtex - 1:Download only papers PDF')
parser.add_argument('--scihub-mirror', default=None, type=str,
help='Mirror for downloading papers from sci-hub. If not set, it is selected automatically')
parser.add_argument('--annas-archive-mirror', default=None, type=str,
help='Mirror for downloading papers from Annas Archive (SciDB). If not set, https://annas-archive.se is used')
parser.add_argument('--scholar-results', default=10, type=int, choices=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
help='Downloads the first x results for each scholar page (default/max=10)')
parser.add_argument('--proxy', nargs='+', default=[],
@@ -91,6 +112,8 @@ def main():
help='Use a single proxy. Recommended if using --proxy gives errors')
parser.add_argument('--selenium-chrome-version', type=int, default=None,
help='First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed.')
parser.add_argument('--use-doi-as-filename', action='store_true', default=False,
help='Use DOIs as output file names')
args = parser.parse_args()

if args.single_proxy is not None:
@@ -123,6 +146,8 @@ def main():
dwn_dir = args.dwn_dir.replace('\\', '/')
if dwn_dir[-1] != '/':
dwn_dir += "/"
if not os.path.exists(dwn_dir):
os.makedirs(dwn_dir, exist_ok=True)

if args.max_dwn_year is not None and args.max_dwn_cites is not None:
print("Error: Only one option between '--max-dwn-year' and '--max-dwn-cites' can be used ")
@@ -174,9 +199,11 @@ def main():


start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type ,
args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites)
args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites,
args.use_doi_as_filename, args.annas_archive_mirror)

if __name__ == "__main__":
checkVersion()
main()
print(
"""\nWork completed!
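How the new --annas-archive-mirror value is normalized in start: if the given mirror does not already contain /scidb, urljoin appends it as an absolute path (example.org used below as a stand-in mirror):

from urllib.parse import urljoin

for mirror in ("https://annas-archive.se", "https://example.org/scidb/"):
    if "/scidb" not in mirror:
        mirror = urljoin(mirror, "/scidb/")
    print(mirror)
# https://annas-archive.se/scidb/
# https://example.org/scidb/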
1 change: 0 additions & 1 deletion PyPaperBot/proxy.py
@@ -1,6 +1,5 @@
import socket
import pyChainedProxy as socks
from .Downloader import downloadPapers

def proxy(pchain):

