From a6a89318a787f248ae18f7899a1a9b5726a38151 Mon Sep 17 00:00:00 2001 From: suhan-paradkar <12suhangp34@gmail.com> Date: Wed, 12 May 2021 16:07:21 +0530 Subject: [PATCH 01/13] Add Scholar_pages argument --- PyPaperBot/Scholar.py | 10 +++++----- PyPaperBot/__main__.py | 18 +++++++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/PyPaperBot/Scholar.py b/PyPaperBot/Scholar.py index 5d21371..81570fa 100644 --- a/PyPaperBot/Scholar.py +++ b/PyPaperBot/Scholar.py @@ -17,12 +17,12 @@ def waithIPchange(): time.sleep(30) return True -def scholar_requests(scholar_pages, url, restrict): +def scholar_requests(scholar_pages, url, restrict, scholar_results): javascript_error = "Sorry, we can't verify that you're not a robot when JavaScript is turned off" to_download = [] for i in scholar_pages: while True: - res_url = url % (10 * (i - 1)) + res_url = url % (scholar_results * (i - 1)) html = requests.get(res_url, headers=NetInfo.HEADERS) html = html.text @@ -34,7 +34,7 @@ def scholar_requests(scholar_pages, url, restrict): break papers = schoolarParser(html) - print("\nGoogle Scholar page {} : {} papers found".format(i,len(papers))) + print("\nGoogle Scholar page {} : {} papers found".format(i,scholar_results)) if(len(papers)>0): papersInfo = getPapersInfo(papers, url, restrict) @@ -49,7 +49,7 @@ def scholar_requests(scholar_pages, url, restrict): -def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None): +def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results): url = r"https://scholar.google.com/scholar?hl=en&q="+query+"&as_vis=1&as_sdt=1,5&start=%d" if min_date!=None: @@ -58,6 +58,6 @@ def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None): if len(query)>7 and (query[0:7]=="http://" or query[0:8]=="https://"): url = query - to_download = scholar_requests(scholar_pages, url, restrict) + to_download = scholar_requests(scholar_pages, url, restrict, scholar_results) return [item for sublist in to_download for item in sublist] diff --git a/PyPaperBot/__main__.py b/PyPaperBot/__main__.py index 5c166b7..f24145c 100644 --- a/PyPaperBot/__main__.py +++ b/PyPaperBot/__main__.py @@ -9,12 +9,12 @@ from .Crossref import getPapersInfoFromDOIs -def start(query, scholar_pages, dwn_dir, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None): +def start(query, scholar_results, scholar_pages, dwn_dir, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None): to_download = [] if DOIs==None: print("Query: {}".format(query)) - to_download = ScholarPapersInfo(query, scholar_pages, restrict, min_date) + to_download = ScholarPapersInfo(query, scholar_pages, restrict, min_date, scholar_results) else: print("Downloading papers from DOIs\n") num = 1 @@ -67,7 +67,7 @@ def main(): parser.add_argument('--journal-filter', default=None, type=str ,help='CSV file path of the journal filter (More info on github)') parser.add_argument('--restrict', default=None, type=int ,choices=[0,1], help='0:Download only Bibtex - 1:Down load only papers PDF') parser.add_argument('--scihub-mirror', default=None, type=str, help='Mirror for downloading papers from sci-hub. 
If not set, it is selected automatically')
-    
+    parser.add_argument('--scholar-results', default=10, type=int, choices=[1,2,3,4,5,6,7,8,9,10], help='Downloads the first x results in a scholar page(max=10)')
     args = parser.parse_args()
 
     if args.query==None and args.doi_file==None and args.doi==None:
@@ -78,10 +78,18 @@ def main():
         print("Error: Only one option between '--query', '--doi-file' and '--doi' can be used")
         sys.exit()
 
+    if args.scholar_results>10 or args.scholar_results<1:
+        print("Error: value of '--scholar-results' must be between 1 to 10")
+        sys.exit()
+
     if args.dwn_dir==None:
         print("Error, provide the directory path in which to save the results")
         sys.exit()
-    
+
+    if args.scholar_results!=10 and args.scholar_pages>1:
+        print("Scholar results is applicable only for --scholar-pages=1 at this moment")
+        sys.exit()
+
     dwn_dir = args.dwn_dir.replace('\\', '/')
     if dwn_dir[len(dwn_dir)-1]!='/':
         dwn_dir = dwn_dir + "/"
@@ -135,7 +143,7 @@ def main():
         max_dwn_type = 1
 
 
-    start(args.query, scholar_pages, dwn_dir, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror)
+    start(args.query, scholar_results, scholar_pages, dwn_dir, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror)
 
 if __name__ == "__main__":
     main()

From 5bccdec30fabccd466450e9f64d459d9b3afe78d Mon Sep 17 00:00:00 2001
From: suhan-paradkar <12suhangp34@gmail.com>
Date: Wed, 12 May 2021 16:20:09 +0530
Subject: [PATCH 02/13] Update README.md

---
 README.md | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dadb71e..6e21902 100644
--- a/README.md
+++ b/README.md
@@ -16,12 +16,30 @@ PyPaerbot is also able to download the **bibtex** of each paper.
 
 ## Installation
 
+### For normal Users
+
 Use `pip` to install from pypi:
 
 ```bash
 pip install PyPaperBot
 ```
 
+### For Termux users
+
+Since numpy cannot be directly installed....
+
+```pkg install wget
+wget https://its-pointless.github.io/setup-pointless-repo.sh
+pkg install numpy
+export CFLAGS="-Wno-deprecated-declarations -Wno-unreachable-code"
+pip install pandas
+```
+
+and
+
+```pip install PyPaperbot
+```
+
 ## How to use
 
 PyPaperBot arguments:
@@ -39,6 +57,7 @@ PyPaperBot arguments:
 | \-\-journal-filter | CSV file path of the journal filter (More info on github) | string |
 | \-\-restrict | 0:Download only Bibtex - 1:Down load only papers PDF | int |
 | \-\-scihub-mirror | Mirror for downloading papers from sci-hub. If not set, it is selected automatically | string |
+| \-\-scholar-results | Number of scholar results to be downloaded when \-\-scholar-pages=1 | int |
 | \-h | Shows the help | -- |
 
 ### Note
@@ -56,7 +75,7 @@ The argument *\-\-journal-filter* require the path of a CSV containing a list o
 
 The argument *\-\-doi-file* require the path of a txt file containing the list of paper's DOIs to download organized with one DOI per line [Example](https://github.com/ferru97/PyPaperBot/blob/master/file_examples/papers.txt)
 
-## SchiHub access
+## SciHub access
 
 If access to SciHub is blocked in your country, consider using a free VPN service like [ProtonVPN](https://protonvpn.com/)
 
@@ -92,6 +111,8 @@ If it doesn't work, try to use *py* instead of *python* i.e.
 py -m PyPaperBot --doi="10.0086/s41037-711-0132-1" --dwn-dir="C:\User\example\papers"`
 ```
 
+In termux, you can directly use ```PyPaperBot``` followed by arguments...
+ ## Contributions Feel free to contribute to this project by proposing any change, fix, and enhancement on the **dev** branch From d6e289809b3ca2e8733c457b585a644398518dae Mon Sep 17 00:00:00 2001 From: suhan-paradkar <12suhangp34@gmail.com> Date: Wed, 12 May 2021 16:22:19 +0530 Subject: [PATCH 03/13] Update README.md again --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6e21902..6d6111d 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ pip install PyPaperBot Since numpy cannot be directly installed.... -```pkg install wget +``` +pkg install wget wget https://its-pointless.github.io/setup-pointless-repo.sh pkg install numpy export CFLAGS="-Wno-deprecated-declarations -Wno-unreachable-code" @@ -37,7 +38,8 @@ pip install pandas and -```pip install PyPaperbot +``` +pip install PyPaperbot ``` ## How to use From 299e422dc09d23b3bf2592dbc6e3a8ae6ab3dd05 Mon Sep 17 00:00:00 2001 From: "@suhan-paradkar" <12suhangp34@gmail.com> Date: Thu, 13 May 2021 13:34:22 +0530 Subject: [PATCH 04/13] Remove a LOT of whitespace --- PyPaperBot/Crossref.py | 31 +++++++++++----------- PyPaperBot/Downloader.py | 35 ++++++++++++------------ PyPaperBot/HTMLparsers.py | 24 ++++++++--------- PyPaperBot/Paper.py | 53 +++++++++++++++++-------------------- PyPaperBot/PapersFilters.py | 24 ++++++++--------- PyPaperBot/Scholar.py | 14 +++++----- PyPaperBot/__main__.py | 48 ++++++++++++++++----------------- 7 files changed, 111 insertions(+), 118 deletions(-) diff --git a/PyPaperBot/Crossref.py b/PyPaperBot/Crossref.py index 613974f..3b3f4c1 100644 --- a/PyPaperBot/Crossref.py +++ b/PyPaperBot/Crossref.py @@ -9,7 +9,7 @@ def getBibtex(DOI): - try: + try: url_bibtex = "http://api.crossref.org/works/" + DOI + "/transform/application/x-bibtex" x = requests.get(url_bibtex) return str(x.text) @@ -20,7 +20,7 @@ def getBibtex(DOI): def getPapersInfoFromDOIs(DOI, restrict): paper_found = Paper() paper_found.DOI = DOI - + try: paper = get_entity(DOI, EntityType.PUBLICATION, OutputType.JSON) if paper!=None and len(paper)>0: @@ -28,14 +28,14 @@ def getPapersInfoFromDOIs(DOI, restrict): paper_found.title = paper["title"][0] if "short-container-title" in paper and len(paper["short-container-title"])>0: paper_found.jurnal = paper["short-container-title"][0] - - if restrict==None or restrict!=1: + + if restrict==None or restrict!=1: paper_found.setBibtex(getBibtex(paper_found.DOI)) except: print("Paper not found "+DOI) - + return paper_found - + #Get paper information from Crossref and return a list of Paper def getPapersInfo(papers, scholar_search_link, restrict): @@ -44,7 +44,7 @@ def getPapersInfo(papers, scholar_search_link, restrict): for paper in papers: title = paper['title'] queries = {'query.bibliographic': title.lower(),'sort':'relevance',"select":"DOI,title,deposited,author,short-container-title"} - + print("Searching paper {} of {} on Crossref...".format(num,len(papers))) num += 1 @@ -53,11 +53,11 @@ def getPapersInfo(papers, scholar_search_link, restrict): while True: try: for el in iterate_publications_as_json(max_results=30, queries=queries): - + el_date = 0 if "deposited" in el and "timestamp" in el["deposited"]: el_date = int(el["deposited"]["timestamp"]) - + if (paper_found.DOI==None or el_date>found_timestamp) and "title" in el and similarStrings(title.lower() ,el["title"][0].lower())>0.75: found_timestamp = el_date @@ -65,18 +65,17 @@ def getPapersInfo(papers, scholar_search_link, restrict): paper_found.DOI = el["DOI"].strip().lower() if 
"short-container-title" in el and len(el["short-container-title"])>0: paper_found.jurnal = el["short-container-title"][0] - - if restrict==None or restrict!=1: + + if restrict==None or restrict!=1: paper_found.setBibtex(getBibtex(paper_found.DOI)) - + break except ConnectionError as e: print("Wait 10 seconds and try again...") time.sleep(10) - - + papers_return.append(paper_found) - + time.sleep(random.randint(1,10)) - + return papers_return diff --git a/PyPaperBot/Downloader.py b/PyPaperBot/Downloader.py index c7e255e..bc65e9f 100644 --- a/PyPaperBot/Downloader.py +++ b/PyPaperBot/Downloader.py @@ -32,66 +32,65 @@ def getSaveDir(folder, fname): while path.exists(dir_): n += 1 dir_ = path.join(folder, "("+str(n)+")"+fname) - + return dir_ -def saveFile(file_name,content, paper,dwn_source): +def saveFile(file_name,content, paper,dwn_source): f = open(file_name, 'wb') f.write(content) f.close() paper.downloaded = True paper.downloadedFrom = dwn_source - - + def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None): def URLjoin(*args): return "/".join(map(lambda x: str(x).rstrip('/'), args)) - + NetInfo.SciHub_URL = SciHub_URL if NetInfo.SciHub_URL==None: setSciHubUrl() - + num_downloaded = 0 paper_number = 1 paper_files = [] - for p in papers: + for p in papers: if p.canBeDownloaded() and (num_limit==None or num_downloaded {}".format(paper_number, len(papers), p.title)) paper_number += 1 - + pdf_dir = getSaveDir(dwnl_dir, p.getFileName()) - + faild = 0 - while p.downloaded==False and faild!=4: + while p.downloaded==False and faild!=4: try: - dwn_source = 1 #1 scihub 2 scholar + dwn_source = 1 #1 scihub 2 scholar if faild==0 and p.DOI!=None: url = URLjoin(NetInfo.SciHub_URL, p.DOI) if faild==1 and p.scholar_link!=None: - url = URLjoin(NetInfo.SciHub_URL, p.scholar_link) + url = URLjoin(NetInfo.SciHub_URL, p.scholar_link) if faild==2 and p.scholar_link!=None and p.scholar_link[-3:]=="pdf": url = p.scholar_link dwn_source = 2 if faild==3 and p.pdf_link!=None: url = p.pdf_link - dwn_source = 2 - + dwn_source = 2 + if url!="": r = requests.get(url, headers=NetInfo.HEADERS) content_type = r.headers.get('content-type') - + if dwn_source==1 and 'application/pdf' not in content_type: time.sleep(random.randint(1,5)) - + pdf_link = getSchiHubPDF(r.text) if(pdf_link != None): r = requests.get(pdf_link, headers=NetInfo.HEADERS) content_type = r.headers.get('content-type') - + if 'application/pdf' in content_type: paper_files.append(saveFile(pdf_dir,r.content,p,dwn_source)) except Exception: pass - + faild += 1 diff --git a/PyPaperBot/HTMLparsers.py b/PyPaperBot/HTMLparsers.py index 8d6784c..937ce86 100644 --- a/PyPaperBot/HTMLparsers.py +++ b/PyPaperBot/HTMLparsers.py @@ -10,7 +10,7 @@ def schoolarParser(html): result = [] soup = BeautifulSoup(html, "html.parser") for element in soup.findAll("div", class_="gs_r gs_or gs_scl"): - if isBook(element) == False: + if isBook(element) == False: title = None link = None link_pdf = None @@ -19,7 +19,7 @@ def schoolarParser(html): authors = None for h3 in element.findAll("h3", class_="gs_rt"): found = False - for a in h3.findAll("a"): + for a in h3.findAll("a"): if found == False: title = a.text link = a.get("href") @@ -48,7 +48,7 @@ def schoolarParser(html): year = None else: year = str(year) - if title!=None: + if title!=None: result.append({ 'title' : title, 'link' : link, @@ -56,8 +56,8 @@ def schoolarParser(html): 'link_pdf' : link_pdf, 'year' : year, 'authors' : authors}) - return result - + return result + def isBook(tag): @@ -72,19 +72,19 @@ def 
isBook(tag): def getSchiHubPDF(html): result = None soup = BeautifulSoup(html, "html.parser") - + iframe = soup.find(id='pdf') plugin = soup.find(id='plugin') - + if iframe!=None: result = iframe.get("src") - + if plugin!=None and result==None: result = plugin.get("src") - + if result!=None and result[0]!="h": result = "https:"+result - + return result def SciHubUrls(html): @@ -96,6 +96,6 @@ def SciHubUrls(html): link = a.get("href") if link.startswith("https://sci-hub.") or link.startswith("http://sci-hub."): result.append(link) - + return result - + diff --git a/PyPaperBot/Paper.py b/PyPaperBot/Paper.py index 9ce2191..feb16df 100644 --- a/PyPaperBot/Paper.py +++ b/PyPaperBot/Paper.py @@ -10,38 +10,38 @@ import os class Paper: - - + + def __init__(self,title=None, scholar_link=None, scholar_page=None, cites=None, link_pdf=None, year=None, authors=None): self.title = title self.scholar_page = scholar_page - self.scholar_link = scholar_link + self.scholar_link = scholar_link self.pdf_link = link_pdf self.year = year self.authors = authors - + self.jurnal = None self.cites_num = None self.bibtex = None self.DOI = None - + self.downloaded = False self.downloadedFrom = 0 #1-SciHub 2-scholar - + def getFileName(self): try: return re.sub('[^\w\-_\. ]', '_', self.title)+".pdf" except: return "none.pdf" - + def setBibtex(self,bibtex): x=bibtexparser.loads(bibtex, parser=None) x=x.entries - + self.bibtex = bibtex - + try: if "year" in x[0]: self.year=x[0]["year"] @@ -50,25 +50,25 @@ def setBibtex(self,bibtex): self.jurnal=x[0]["journal"].replace("\\","") if "journal" in x[0] else None if self.jurnal==None: self.jurnal=x[0]["publisher"].replace("\\","") if "publisher" in x[0] else None - + except: pass - - + + def canBeDownloaded(self): if self.DOI!=None or self.scholar_link!=None: return True return False - - - def generateReport(papers, path): + + + def generateReport(papers, path): with open(path, mode="w", encoding='utf-8', newline='', buffering=1) as w_file: content = ["Name", "Scholar Link", "DOI", "Bibtex", "PDF Name", "Year", "Scholar page", "Journal", "Downloaded", "Downloaded from", "Authors"] file_writer = csv.DictWriter(w_file, delimiter = ",", lineterminator=os.linesep, fieldnames=content) file_writer.writeheader() - + for p in papers: pdf_name = p.getFileName() if p.downloaded==True else "" bibtex_found = True if p.bibtex!=None else False @@ -78,7 +78,7 @@ def generateReport(papers, path): dwn_from = "SciHub" if p.downloadedFrom == 2: dwn_from = "Scholar" - + file_writer.writerow({ "Name" : p.title, "Scholar Link" : p.scholar_link, @@ -90,25 +90,20 @@ def generateReport(papers, path): "Journal" : p.jurnal, "Downloaded" : p.downloaded, "Downloaded from" : dwn_from, - "Authors" : p.authors}) - - + "Authors" : p.authors}) + + def generateBibtex(papers, path): - content = "" + content = "" for p in papers: if p.bibtex!=None: content += p.bibtex+"\n" - - + + relace_list = ["\ast","*","#"] for c in relace_list: content = content.replace(c,"") - + f = open(path, "w", encoding="latin-1", errors="ignore") f.write(str(content)) f.close() - - - - - diff --git a/PyPaperBot/PapersFilters.py b/PyPaperBot/PapersFilters.py index 7b0f983..a8c2a13 100644 --- a/PyPaperBot/PapersFilters.py +++ b/PyPaperBot/PapersFilters.py @@ -11,33 +11,33 @@ def similarStrings(a, b): return SequenceMatcher(None, a, b).ratio() """ -Input +Input papers: list of Paper csv_path: path of a csv containing the journals to include (consult the GitHub page for the csv format) Output - result: list of Paper published by the 
journals included in the csv + result: list of Paper published by the journals included in the csv """ def filterJurnals(papers,csv_path): result = [] df = pd.read_csv(csv_path, sep=";") journal_list = list(df["journal_list"]) include_list = list(df["include_list"]) - + for p in papers: good = False if (p.jurnal!=None and len(p.jurnal)>0) else True - if p.jurnal!=None: + if p.jurnal!=None: for jurnal,include in zip(journal_list,include_list): - if include==1 and similarStrings(p.jurnal,jurnal)>=0.8: + if include==1 and similarStrings(p.jurnal,jurnal)>=0.8: good = True - + if good == True: result.append(p) - + return result - + """ -Input +Input papers: list of Paper min_year: minimal publication year accepted Output @@ -45,9 +45,9 @@ def filterJurnals(papers,csv_path): """ def filter_min_date(list_papers,min_year): new_list = [] - + for paper in list_papers: if paper.year!=None and int(paper.year)>=min_year: new_list.append(paper) - - return new_list \ No newline at end of file + + return new_list diff --git a/PyPaperBot/Scholar.py b/PyPaperBot/Scholar.py index 81570fa..bfda0c6 100644 --- a/PyPaperBot/Scholar.py +++ b/PyPaperBot/Scholar.py @@ -25,7 +25,7 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results): res_url = url % (scholar_results * (i - 1)) html = requests.get(res_url, headers=NetInfo.HEADERS) html = html.text - + if javascript_error in html: is_continue = waithIPchange() if not is_continue: @@ -35,12 +35,12 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results): papers = schoolarParser(html) print("\nGoogle Scholar page {} : {} papers found".format(i,scholar_results)) - + if(len(papers)>0): papersInfo = getPapersInfo(papers, url, restrict) info_valids = functools.reduce(lambda a,b : a+1 if b.DOI!=None else a, papersInfo, 0) print("Papers found on Crossref: {}/{}\n".format(info_valids,len(papers))) - + to_download.append(papersInfo) else: print("Paper not found...") @@ -50,14 +50,14 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results): def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results): - + url = r"https://scholar.google.com/scholar?hl=en&q="+query+"&as_vis=1&as_sdt=1,5&start=%d" if min_date!=None: url += "&as_ylo="+str(min_date) if len(query)>7 and (query[0:7]=="http://" or query[0:8]=="https://"): - url = query - + url = query + to_download = scholar_requests(scholar_pages, url, restrict, scholar_results) - + return [item for sublist in to_download for item in sublist] diff --git a/PyPaperBot/__main__.py b/PyPaperBot/__main__.py index f24145c..d2a0bd3 100644 --- a/PyPaperBot/__main__.py +++ b/PyPaperBot/__main__.py @@ -10,10 +10,10 @@ def start(query, scholar_results, scholar_pages, dwn_dir, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None): - + to_download = [] if DOIs==None: - print("Query: {}".format(query)) + print("Query: {}".format(query)) to_download = ScholarPapersInfo(query, scholar_pages, restrict, min_date, scholar_results) else: print("Downloading papers from DOIs\n") @@ -27,32 +27,32 @@ def start(query, scholar_results, scholar_pages, dwn_dir, min_date=None, num_lim num += 1 i += 1 - - + + if restrict!=0 and to_download: if filter_jurnal_file!=None: to_download = filterJurnals(to_download,filter_jurnal_file) - + if min_date!=None: - to_download = filter_min_date(to_download,min_date) - - if num_limit_type!=None and num_limit_type==0: + to_download = filter_min_date(to_download,min_date) + + if 
num_limit_type!=None and num_limit_type==0: to_download.sort(key=lambda x: int(x.sc_year) if x.sc_year!=None else 0, reverse=True) - - if num_limit_type!=None and num_limit_type==1: + + if num_limit_type!=None and num_limit_type==1: to_download.sort(key=lambda x: int(x.sc_cites) if x.sc_cites!=None else 0, reverse=True) - + downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL) Paper.generateReport(to_download,dwn_dir+"result.csv") Paper.generateBibtex(to_download,dwn_dir+"bibtex.bib") - + def main(): print("""PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref and SciHub.\nIf you like this project, you can give me a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""") - + parser = argparse.ArgumentParser(description='PyPaperBot is python tool to search and dwonload scientific papers using Google Scholar, Crossref and SciHub') parser.add_argument('--query', type=str, default=None, help='Query to make on Google Scholar or Google Scholar page link') parser.add_argument('--doi', type=str, default=None, help='DOI of the paper to download (this option uses only SciHub to download)') @@ -69,11 +69,11 @@ def main(): parser.add_argument('--scihub-mirror', default=None, type=str, help='Mirror for downloading papers from sci-hub. If not set, it is selected automatically') parser.add_argument('--scholar-results', default=10, type=int, choices=[1,2,3,4,5,6,7,8,9,10], help='Downloads the first x results in a scholar page(max=10)') args = parser.parse_args() - + if args.query==None and args.doi_file==None and args.doi==None: print("Error, provide at least one of the following arguments: --query or --file") sys.exit() - + if (args.query!=None and args.doi_file!=None) or (args.query!=None and args.doi!=None) or (args.doi!=None and args.doi_file!=None): print("Error: Only one option between '--query', '--doi-file' and '--doi' can be used") sys.exit() @@ -93,11 +93,11 @@ def main(): dwn_dir = args.dwn_dir.replace('\\', '/') if dwn_dir[len(dwn_dir)-1]!='/': dwn_dir = dwn_dir + "/" - + if args.max_dwn_year != None and args.max_dwn_cites != None: print("Error: Only one option between '--max-dwn-year' and '--max-dwn-cites' can be used ") sys.exit() - + if(args.query != None): if args.scholar_pages: try: @@ -119,20 +119,20 @@ def main(): scholar_pages = 0 - DOIs = None + DOIs = None if args.doi_file!=None: - DOIs = [] + DOIs = [] f = args.doi_file.replace('\\', '/') with open(f) as file_in: for line in file_in: if line[len(line)-1]=='\n': DOIs.append(line[:-1]) else: - DOIs.append(line) + DOIs.append(line) + + if args.doi!=None: + DOIs = [args.doi] - if args.doi!=None: - DOIs = [args.doi] - max_dwn = None max_dwn_type = None if args.max_dwn_year != None: @@ -141,7 +141,7 @@ def main(): if args.max_dwn_cites != None: max_dwn = args.max_dwn_cites max_dwn_type = 1 - + start(args.query, scholar_results, scholar_pages, dwn_dir, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror) From 811c7eac695be29a0ccd230b1e177ef93c7044bf Mon Sep 17 00:00:00 2001 From: "@suhan-paradkar" <12suhangp34@gmail.com> Date: Thu, 13 May 2021 15:25:08 +0530 Subject: [PATCH 05/13] Add proxy options --- PyPaperBot/__main__.py | 24 ++++++++++++++++-------- PyPaperBot/proxy.py | 8 ++++++++ requirements.txt | 1 + 3 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 PyPaperBot/proxy.py diff --git a/PyPaperBot/__main__.py b/PyPaperBot/__main__.py index d2a0bd3..2e6d2ca 100644 --- a/PyPaperBot/__main__.py +++ 
b/PyPaperBot/__main__.py @@ -7,9 +7,12 @@ from .Downloader import downloadPapers from .Scholar import ScholarPapersInfo from .Crossref import getPapersInfoFromDOIs +from .proxy import proxy +def start(query, scholar_results, scholar_pages, dwn_dir, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, host=None, port=None): -def start(query, scholar_results, scholar_pages, dwn_dir, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None): + if host!=None: + proxy(host, port) to_download = [] if DOIs==None: @@ -44,12 +47,11 @@ def start(query, scholar_results, scholar_pages, dwn_dir, min_date=None, num_lim downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL) + Paper.generateReport(to_download,dwn_dir+"result.csv") Paper.generateBibtex(to_download,dwn_dir+"bibtex.bib") - - def main(): print("""PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref and SciHub.\nIf you like this project, you can give me a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)\n""") @@ -68,27 +70,33 @@ def main(): parser.add_argument('--restrict', default=None, type=int ,choices=[0,1], help='0:Download only Bibtex - 1:Down load only papers PDF') parser.add_argument('--scihub-mirror', default=None, type=str, help='Mirror for downloading papers from sci-hub. If not set, it is selected automatically') parser.add_argument('--scholar-results', default=10, type=int, choices=[1,2,3,4,5,6,7,8,9,10], help='Downloads the first x results in a scholar page(max=10)') + parser.add_argument('--host', default=None, type=str, help='Use proxy host') + parser.add_argument('--port', default=None, type=int, help='Use proxy port') args = parser.parse_args() if args.query==None and args.doi_file==None and args.doi==None: print("Error, provide at least one of the following arguments: --query or --file") sys.exit() + if args.port!=None and args.host==None: + print("Error, Host not provided") + sys.exit() + if (args.query!=None and args.doi_file!=None) or (args.query!=None and args.doi!=None) or (args.doi!=None and args.doi_file!=None): print("Error: Only one option between '--query', '--doi-file' and '--doi' can be used") sys.exit() if args.scholar_results>10 or args.scholar_results<1: - print("Error: value of '--scholar-results' must be between 1 to 10") - sys.exit() + print("Error: value of '--scholar-results' must be between 1 to 10") + sys.exit() if args.dwn_dir==None: print("Error, provide the directory path in which to save the results") sys.exit() if args.scholar_results!=10 and args.scholar_pages>1: - print("Scholar results is applicable only for --scholar-pages=1 at this moment") - sys.exit() + print("Scholar results is applicable only for --scholar-pages=1 at this moment") + sys.exit() dwn_dir = args.dwn_dir.replace('\\', '/') if dwn_dir[len(dwn_dir)-1]!='/': @@ -143,7 +151,7 @@ def main(): max_dwn_type = 1 - start(args.query, scholar_results, scholar_pages, dwn_dir, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror) + start(args.query, scholar_results, scholar_pages, dwn_dir, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror, host, port) if __name__ == "__main__": main() diff --git a/PyPaperBot/proxy.py b/PyPaperBot/proxy.py new file mode 100644 index 0000000..e0244ad --- /dev/null +++ b/PyPaperBot/proxy.py @@ -0,0 +1,8 @@ +import proxy + 
+def proxy(host, port): +if __name__ == '__main__': +proxy.main([ + '--hostname', host, + '--port', port +]) diff --git a/requirements.txt b/requirements.txt index 3b20fee..a5944d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ lazy-object-proxy>=1.4.3 mccabe>=0.6.1 numpy pandas +proxy.py>=2.0.0 pylint>=2.6.0 pyparsing>=2.4.7 python-dateutil>=2.8.1 From 66c52763052e3a67e615fdbab21722d2d39f99a9 Mon Sep 17 00:00:00 2001 From: suhan-paradkar <12suhangp34@gmail.com> Date: Sat, 15 May 2021 13:03:43 +0530 Subject: [PATCH 06/13] remove few errors --- PyPaperBot/Downloader.py | 4 ++-- PyPaperBot/Scholar.py | 4 ++-- PyPaperBot/proxy.py | 6 +++--- requirements.txt | 2 +- setup.py | 5 +++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/PyPaperBot/Downloader.py b/PyPaperBot/Downloader.py index bc65e9f..a61a38e 100644 --- a/PyPaperBot/Downloader.py +++ b/PyPaperBot/Downloader.py @@ -22,7 +22,7 @@ def setSciHubUrl(): if found: print("\nUsing {} as Sci-Hub instance".format(NetInfo.SciHub_URL)) else: - print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN") + print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy") NetInfo.SciHub_URL = "https://sci-hub.st" @@ -55,7 +55,7 @@ def URLjoin(*args): paper_number = 1 paper_files = [] for p in papers: - if p.canBeDownloaded() and (num_limit==None or num_downloaded {}".format(paper_number, len(papers), p.title)) paper_number += 1 diff --git a/PyPaperBot/Scholar.py b/PyPaperBot/Scholar.py index bfda0c6..6a16522 100644 --- a/PyPaperBot/Scholar.py +++ b/PyPaperBot/Scholar.py @@ -17,7 +17,7 @@ def waithIPchange(): time.sleep(30) return True -def scholar_requests(scholar_pages, url, restrict, scholar_results): +def scholar_requests(scholar_pages, url, restrict, scholar_results=10): javascript_error = "Sorry, we can't verify that you're not a robot when JavaScript is turned off" to_download = [] for i in scholar_pages: @@ -49,7 +49,7 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results): -def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results): +def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results=10): url = r"https://scholar.google.com/scholar?hl=en&q="+query+"&as_vis=1&as_sdt=1,5&start=%d" if min_date!=None: diff --git a/PyPaperBot/proxy.py b/PyPaperBot/proxy.py index e0244ad..8533b1b 100644 --- a/PyPaperBot/proxy.py +++ b/PyPaperBot/proxy.py @@ -1,8 +1,8 @@ import proxy def proxy(host, port): -if __name__ == '__main__': -proxy.main([ + if __name__ == '__main__': + proxy.main([ '--hostname', host, '--port', port -]) + ]) diff --git a/requirements.txt b/requirements.txt index a5944d8..499b1c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ colorama>=0.4.3 crossref-commons>=0.0.7 future>=0.18.2 HTMLParser>=0.0.2 -idna>=2.10 +idna>=2.10,<3 isort>=5.4.2 lazy-object-proxy>=1.4.3 mccabe>=0.6.1 diff --git a/setup.py b/setup.py index bbf39d1..2c08402 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.1.1.tar.gz', keywords = ['download-papers','google-scholar', 'scihub', 'scholar', 'crossref', 'papers'], install_requires=[ - 'astroid>=2.4.2', + 'astroid>=2.4.2,<=2.5', 'beautifulsoup4>=4.9.1', 'bibtexparser>=1.2.0', 'certifi>=2020.6.20', @@ -26,12 +26,13 @@ 'crossref-commons>=0.0.7', 'future>=0.18.2', 'HTMLParser>=0.0.2', - 'idna>=2.10', + 
'idna>=2.10,<3', 'isort>=5.4.2', 'lazy-object-proxy>=1.4.3', 'mccabe>=0.6.1', 'numpy', 'pandas', + 'proxy.py>=2.0.0', 'pylint>=2.6.0', 'pyparsing>=2.4.7', 'python-dateutil>=2.8.1', From 9ef2054a1814b5824de4f1aede18318eaf538a27 Mon Sep 17 00:00:00 2001 From: suhan-paradkar <12suhangp34@gmail.com> Date: Sat, 15 May 2021 16:10:52 +0530 Subject: [PATCH 07/13] remove more errors --- PyPaperBot/__main__.py | 23 +++++++++-------------- PyPaperBot/proxy.py | 22 ++++++++++++++-------- setup.py | 2 +- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/PyPaperBot/__main__.py b/PyPaperBot/__main__.py index 2e6d2ca..dd78a1d 100644 --- a/PyPaperBot/__main__.py +++ b/PyPaperBot/__main__.py @@ -70,33 +70,28 @@ def main(): parser.add_argument('--restrict', default=None, type=int ,choices=[0,1], help='0:Download only Bibtex - 1:Down load only papers PDF') parser.add_argument('--scihub-mirror', default=None, type=str, help='Mirror for downloading papers from sci-hub. If not set, it is selected automatically') parser.add_argument('--scholar-results', default=10, type=int, choices=[1,2,3,4,5,6,7,8,9,10], help='Downloads the first x results in a scholar page(max=10)') - parser.add_argument('--host', default=None, type=str, help='Use proxy host') - parser.add_argument('--port', default=None, type=int, help='Use proxy port') + parser.add_argument('--proxy', nargs='*', default=[], help='Use proxychains, provide comma seperated list of proxies to use and please, no spaces') args = parser.parse_args() + pchain = [] + pchain = args.proxy + + proxy(pchain) + if args.query==None and args.doi_file==None and args.doi==None: print("Error, provide at least one of the following arguments: --query or --file") sys.exit() - if args.port!=None and args.host==None: - print("Error, Host not provided") - sys.exit() - if (args.query!=None and args.doi_file!=None) or (args.query!=None and args.doi!=None) or (args.doi!=None and args.doi_file!=None): print("Error: Only one option between '--query', '--doi-file' and '--doi' can be used") sys.exit() - if args.scholar_results>10 or args.scholar_results<1: - print("Error: value of '--scholar-results' must be between 1 to 10") - sys.exit() - if args.dwn_dir==None: print("Error, provide the directory path in which to save the results") sys.exit() - if args.scholar_results!=10 and args.scholar_pages>1: - print("Scholar results is applicable only for --scholar-pages=1 at this moment") - sys.exit() + if args.scholar_results!=10 and args.scholar_pages!=1: + print("Scholar results best applied along with --scholar-pages=1") dwn_dir = args.dwn_dir.replace('\\', '/') if dwn_dir[len(dwn_dir)-1]!='/': @@ -151,7 +146,7 @@ def main(): max_dwn_type = 1 - start(args.query, scholar_results, scholar_pages, dwn_dir, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror, host, port) + start(args.query, args.scholar_results, scholar_pages, dwn_dir, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror, host, port) if __name__ == "__main__": main() diff --git a/PyPaperBot/proxy.py b/PyPaperBot/proxy.py index 8533b1b..58ba1ea 100644 --- a/PyPaperBot/proxy.py +++ b/PyPaperBot/proxy.py @@ -1,8 +1,14 @@ -import proxy - -def proxy(host, port): - if __name__ == '__main__': - proxy.main([ - '--hostname', host, - '--port', port - ]) +import socket +import pyChainedProxy as socks +from .Downloader import downloadPapers + +def proxy(pchain): + + chain = pchain + + socks.setdefaultproxy() + for hop in chain: + 
socks.adddefaultproxy(*socks.parseproxy(hop)) + + rawsocket = socket.socket + socket.socket = socks.socksocket diff --git a/setup.py b/setup.py index 2c08402..1082356 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ 'mccabe>=0.6.1', 'numpy', 'pandas', - 'proxy.py>=2.0.0', + 'pyChainedProxy>=1.1', 'pylint>=2.6.0', 'pyparsing>=2.4.7', 'python-dateutil>=2.8.1', From e465da8e5d410a61c0c91ba020ea9529889ed2bd Mon Sep 17 00:00:00 2001 From: "@suhan-paradkar" <12suhangp34@gmail.com> Date: Sat, 15 May 2021 16:13:08 +0530 Subject: [PATCH 08/13] Add libgen --- PyPaperBot/Downloader.py | 14 +++++++++++++- PyPaperBot/HTMLparsers.py | 11 +++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/PyPaperBot/Downloader.py b/PyPaperBot/Downloader.py index bc65e9f..cb2ded8 100644 --- a/PyPaperBot/Downloader.py +++ b/PyPaperBot/Downloader.py @@ -1,7 +1,7 @@ from os import path import requests import time -from .HTMLparsers import getSchiHubPDF, SciHubUrls +from .HTMLparsers import getSchiHubPDF, SciHubUrls, LibgenUrls import random from .NetInfo import NetInfo @@ -25,6 +25,18 @@ def setSciHubUrl(): print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN") NetInfo.SciHub_URL = "https://sci-hub.st" +def setLibgenUrl(): + r = requests.get(NetInfo.Libgen_URLs_repo, headers=NetInfo.HEADERS) + links = LibgenURLs(r.text) + found = False + + for l in links: + try: + r = requests.get(l, headers=NetInfo.HEADERS) + if r.status_code == 200: + found =True + NetInfo.Libgen_URL = 1 + break def getSaveDir(folder, fname): dir_ = path.join(folder, fname) diff --git a/PyPaperBot/HTMLparsers.py b/PyPaperBot/HTMLparsers.py index 937ce86..e553240 100644 --- a/PyPaperBot/HTMLparsers.py +++ b/PyPaperBot/HTMLparsers.py @@ -99,3 +99,14 @@ def SciHubUrls(html): return result +def LibgenUrls(html): + result = [] + soup = BeautifulSoup(html, "html.parser") + + for ul in soup.findall("ul"): + for a in ul.findAll("a"): + link = a.get("href") + if link.startswith("https://libgen.") or link.startswith("http://libgen"): + result.append(link) + + return result From 713544e4feebd3506b42722cf68226892e98f0bd Mon Sep 17 00:00:00 2001 From: suhan-paradkar <12suhangp34@gmail.com> Date: Sun, 16 May 2021 11:43:20 +0530 Subject: [PATCH 09/13] refine option --scholar-results --- PyPaperBot/Crossref.py | 55 ++++++++++++++++++++-------------------- PyPaperBot/Downloader.py | 4 +-- PyPaperBot/Scholar.py | 2 +- PyPaperBot/__main__.py | 7 ++--- 4 files changed, 33 insertions(+), 35 deletions(-) diff --git a/PyPaperBot/Crossref.py b/PyPaperBot/Crossref.py index 3b3f4c1..635200a 100644 --- a/PyPaperBot/Crossref.py +++ b/PyPaperBot/Crossref.py @@ -38,44 +38,45 @@ def getPapersInfoFromDOIs(DOI, restrict): #Get paper information from Crossref and return a list of Paper -def getPapersInfo(papers, scholar_search_link, restrict): +def getPapersInfo(papers, scholar_search_link, restrict, scholar_results): papers_return = [] num = 1 for paper in papers: - title = paper['title'] - queries = {'query.bibliographic': title.lower(),'sort':'relevance',"select":"DOI,title,deposited,author,short-container-title"} + while num <= scholar_results: + title = paper['title'] + queries = {'query.bibliographic': title.lower(),'sort':'relevance',"select":"DOI,title,deposited,author,short-container-title"} - print("Searching paper {} of {} on Crossref...".format(num,len(papers))) - num += 1 + print("Searching paper {} of {} on Crossref...".format(num,scholar_results)) + num += 1 - found_timestamp = 0 - 
paper_found = Paper(title,paper['link'],scholar_search_link, paper['cites'], paper['link_pdf'], paper['year'], paper['authors']) - while True: - try: - for el in iterate_publications_as_json(max_results=30, queries=queries): + found_timestamp = 0 + paper_found = Paper(title,paper['link'],scholar_search_link, paper['cites'], paper['link_pdf'], paper['year'], paper['authors']) + while True: + try: + for el in iterate_publications_as_json(max_results=30, queries=queries): - el_date = 0 - if "deposited" in el and "timestamp" in el["deposited"]: - el_date = int(el["deposited"]["timestamp"]) + el_date = 0 + if "deposited" in el and "timestamp" in el["deposited"]: + el_date = int(el["deposited"]["timestamp"]) - if (paper_found.DOI==None or el_date>found_timestamp) and "title" in el and similarStrings(title.lower() ,el["title"][0].lower())>0.75: - found_timestamp = el_date + if (paper_found.DOI==None or el_date>found_timestamp) and "title" in el and similarStrings(title.lower() ,el["title"][0].lower())>0.75: + found_timestamp = el_date - if "DOI" in el: - paper_found.DOI = el["DOI"].strip().lower() - if "short-container-title" in el and len(el["short-container-title"])>0: - paper_found.jurnal = el["short-container-title"][0] + if "DOI" in el: + paper_found.DOI = el["DOI"].strip().lower() + if "short-container-title" in el and len(el["short-container-title"])>0: + paper_found.jurnal = el["short-container-title"][0] - if restrict==None or restrict!=1: - paper_found.setBibtex(getBibtex(paper_found.DOI)) + if restrict==None or restrict!=1: + paper_found.setBibtex(getBibtex(paper_found.DOI)) - break - except ConnectionError as e: - print("Wait 10 seconds and try again...") - time.sleep(10) + break + except ConnectionError as e: + print("Wait 10 seconds and try again...") + time.sleep(10) - papers_return.append(paper_found) + papers_return.append(paper_found) - time.sleep(random.randint(1,10)) + time.sleep(random.randint(1,10)) return papers_return diff --git a/PyPaperBot/Downloader.py b/PyPaperBot/Downloader.py index a61a38e..4258df8 100644 --- a/PyPaperBot/Downloader.py +++ b/PyPaperBot/Downloader.py @@ -43,7 +43,7 @@ def saveFile(file_name,content, paper,dwn_source): paper.downloaded = True paper.downloadedFrom = dwn_source -def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None): +def downloadPapers(papers, dwnl_dir, num_limit, scholar_results, SciHub_URL=None): def URLjoin(*args): return "/".join(map(lambda x: str(x).rstrip('/'), args)) @@ -56,7 +56,7 @@ def URLjoin(*args): paper_files = [] for p in papers: if p.canBeDownloaded() and (num_limit==None or num_downloaded {}".format(paper_number, len(papers), p.title)) + print("Download {} of {} -> {}".format(paper_number, scholar_results, p.title)) paper_number += 1 pdf_dir = getSaveDir(dwnl_dir, p.getFileName()) diff --git a/PyPaperBot/Scholar.py b/PyPaperBot/Scholar.py index 6a16522..184b6a9 100644 --- a/PyPaperBot/Scholar.py +++ b/PyPaperBot/Scholar.py @@ -37,7 +37,7 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10): print("\nGoogle Scholar page {} : {} papers found".format(i,scholar_results)) if(len(papers)>0): - papersInfo = getPapersInfo(papers, url, restrict) + papersInfo = getPapersInfo(papers, url, restrict, scholar_results) info_valids = functools.reduce(lambda a,b : a+1 if b.DOI!=None else a, papersInfo, 0) print("Papers found on Crossref: {}/{}\n".format(info_valids,len(papers))) diff --git a/PyPaperBot/__main__.py b/PyPaperBot/__main__.py index dd78a1d..7d09c03 100644 --- a/PyPaperBot/__main__.py +++ 
b/PyPaperBot/__main__.py @@ -9,10 +9,7 @@ from .Crossref import getPapersInfoFromDOIs from .proxy import proxy -def start(query, scholar_results, scholar_pages, dwn_dir, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, host=None, port=None): - - if host!=None: - proxy(host, port) +def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None, filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None): to_download = [] if DOIs==None: @@ -146,7 +143,7 @@ def main(): max_dwn_type = 1 - start(args.query, args.scholar_results, scholar_pages, dwn_dir, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror, host, port) + start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror) if __name__ == "__main__": main() From 4ef3bb007711e683b832013cdfa8a73c6f9eb91c Mon Sep 17 00:00:00 2001 From: "@suhan-paradkar" <12suhangp34@gmail.com> Date: Sun, 23 May 2021 10:55:45 +0530 Subject: [PATCH 10/13] remove description-file warning given while running setup.py install --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index f814226..aea9a7f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] -description-file = README.md +description_file = README.md [options.entry_points] console_scripts = - PyPaperBot = PyPaperBot.__main__:main \ No newline at end of file + PyPaperBot = PyPaperBot.__main__:main From 20dae4b0dc2cdc885b6a2f3faac789c66c59cbca Mon Sep 17 00:00:00 2001 From: suhan-paradkar <12suhangp34@gmail.com> Date: Mon, 24 May 2021 08:49:55 +0530 Subject: [PATCH 11/13] Revert "Add libgen" This reverts commit e465da8e5d410a61c0c91ba020ea9529889ed2bd. 
---
 PyPaperBot/Downloader.py  | 14 +-------------
 PyPaperBot/HTMLparsers.py | 11 -----------
 2 files changed, 1 insertion(+), 24 deletions(-)

diff --git a/PyPaperBot/Downloader.py b/PyPaperBot/Downloader.py
index c9261c0..4258df8 100644
--- a/PyPaperBot/Downloader.py
+++ b/PyPaperBot/Downloader.py
@@ -1,7 +1,7 @@
 from os import path
 import requests
 import time
-from .HTMLparsers import getSchiHubPDF, SciHubUrls, LibgenUrls
+from .HTMLparsers import getSchiHubPDF, SciHubUrls
 import random
 
 from .NetInfo import NetInfo
@@ -25,18 +25,6 @@ def setSciHubUrl():
         print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy")
         NetInfo.SciHub_URL = "https://sci-hub.st"
 
-def setLibgenUrl():
-    r = requests.get(NetInfo.Libgen_URLs_repo, headers=NetInfo.HEADERS)
-    links = LibgenURLs(r.text)
-    found = False
-
-    for l in links:
-        try:
-            r = requests.get(l, headers=NetInfo.HEADERS)
-            if r.status_code == 200:
-                found =True
-                NetInfo.Libgen_URL = 1
-                break
 
 def getSaveDir(folder, fname):
     dir_ = path.join(folder, fname)
diff --git a/PyPaperBot/HTMLparsers.py b/PyPaperBot/HTMLparsers.py
index e553240..937ce86 100644
--- a/PyPaperBot/HTMLparsers.py
+++ b/PyPaperBot/HTMLparsers.py
@@ -99,14 +99,3 @@ def SciHubUrls(html):
 
     return result
 
-def LibgenUrls(html):
-    result = []
-    soup = BeautifulSoup(html, "html.parser")
-
-    for ul in soup.findall("ul"):
-        for a in ul.findAll("a"):
-            link = a.get("href")
-            if link.startswith("https://libgen.") or link.startswith("http://libgen"):
-                result.append(link)
-
-    return result

From f476bdc3636a7a60c45f458a9e00b5c1a366edbe Mon Sep 17 00:00:00 2001
From: suhan-paradkar <12suhangp34@gmail.com>
Date: Mon, 24 May 2021 09:55:08 +0530
Subject: [PATCH 12/13] refine proxies

---
 PyPaperBot/__main__.py |  3 +--
 README.md              | 12 +++++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/PyPaperBot/__main__.py b/PyPaperBot/__main__.py
index 7d09c03..d6bd046 100644
--- a/PyPaperBot/__main__.py
+++ b/PyPaperBot/__main__.py
@@ -67,12 +67,11 @@ def main():
     parser.add_argument('--restrict', default=None, type=int ,choices=[0,1], help='0:Download only Bibtex - 1:Down load only papers PDF')
     parser.add_argument('--scihub-mirror', default=None, type=str, help='Mirror for downloading papers from sci-hub. If not set, it is selected automatically')
     parser.add_argument('--scholar-results', default=10, type=int, choices=[1,2,3,4,5,6,7,8,9,10], help='Downloads the first x results in a scholar page(max=10)')
-    parser.add_argument('--proxy', nargs='*', default=[], help='Use proxychains, provide comma seperated list of proxies to use and please, no spaces')
+    parser.add_argument('--proxy', nargs='+', default=[], help='Use proxychains, provide a space-separated list of proxies to use. Please specify this argument at the end')
     args = parser.parse_args()
 
     pchain = []
     pchain = args.proxy
-
     proxy(pchain)
 
     if args.query==None and args.doi_file==None and args.doi==None:
         print("Error, provide at least one of the following arguments: --query or --file")
         sys.exit()
diff --git a/README.md b/README.md
index 6d6111d..8abc36e 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ PyPaperBot arguments:
 | \-\-restrict | 0:Download only Bibtex - 1:Down load only papers PDF | int |
 | \-\-scihub-mirror | Mirror for downloading papers from sci-hub. If not set, it is selected automatically | string |
 | \-\-scholar-results | Number of scholar results to be downloaded when \-\-scholar-pages=1 | int |
+| \-\-proxy | Proxies to be used. Please specify the protocol | string |
 | \-h | Shows the help | -- |
 
 ### Note
@@ -77,9 +78,12 @@ The argument *\-\-journal-filter* require the path of a CSV containing a list o
 
 The argument *\-\-doi-file* require the path of a txt file containing the list of paper's DOIs to download organized with one DOI per line [Example](https://github.com/ferru97/PyPaperBot/blob/master/file_examples/papers.txt)
 
+Use the --proxy argument after all other arguments and specify the protocol to be used. See the examples to understand how to use this option.
+
 ## SciHub access
 
-If access to SciHub is blocked in your country, consider using a free VPN service like [ProtonVPN](https://protonvpn.com/)
+If access to SciHub is blocked in your country, consider using a free VPN service like [ProtonVPN](https://protonvpn.com/)
+Also, you can use the proxy option described above.
 
 ## Example
 
@@ -113,6 +117,12 @@ If it doesn't work, try to use *py* instead of *python* i.e.
 py -m PyPaperBot --doi="10.0086/s41037-711-0132-1" --dwn-dir="C:\User\example\papers"`
 ```
 
+Using a proxy
+
+```
+python -m PyPaperBot --query=rheumatoid+arthritis --scholar-pages=1 --scholar-results=7 --dwn-dir=/download --proxy http://1.1.1.1::8080 https://8.8.8.8::8080
+```
+
 In termux, you can directly use ```PyPaperBot``` followed by arguments...
 
 ## Contributions

From c54914c24b18a769410bed97719a90e68a7c0c0b Mon Sep 17 00:00:00 2001
From: Vito Ferrulli
Date: Wed, 2 Jun 2021 11:16:25 +0200
Subject: [PATCH 13/13] v1.2 out

---
 PyPaperBot/__init__.py | 2 +-
 setup.py               | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyPaperBot/__init__.py b/PyPaperBot/__init__.py
index 4be8728..f33b6fd 100644
--- a/PyPaperBot/__init__.py
+++ b/PyPaperBot/__init__.py
@@ -1 +1 @@
-__version__= "1.1.1"
+__version__= "1.2"
diff --git a/setup.py b/setup.py
index 1082356..6cd21fb 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setuptools.setup(
     name = 'PyPaperBot',
     packages = setuptools.find_packages(),
-    version = '1.1.1',
+    version = '1.2',
     license='MIT',
     description = 'PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref, and SciHub.',
     long_description=long_description,
@@ -14,7 +14,7 @@
     author = 'Vito Ferrulli',
     author_email = 'vitof970@gmail.com',
     url = 'https://github.com/ferru97/PyPaperBot',
-    download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.1.1.tar.gz',
+    download_url = 'https://github.com/ferru97/PyPaperBot/archive/v1.2.tar.gz',
     keywords = ['download-papers','google-scholar', 'scihub', 'scholar', 'crossref', 'papers'],
     install_requires=[
         'astroid>=2.4.2,<=2.5',