diff --git a/scraping/scraping/getHtml.py b/scraping/scraping/getHtml.py
index 6b574d9..681b43b 100644
--- a/scraping/scraping/getHtml.py
+++ b/scraping/scraping/getHtml.py
@@ -6,11 +6,12 @@ from bs4 import BeautifulSoup
 import time
 import re
+import urllib.parse
 
 
 def setup_chrome_driver():
     """set up Chrome driver"""
     chrome_options = Options()
-    chrome_options.add_argument('--headless')
+    # chrome_options.add_argument('--headless')
     chrome_options.add_argument('--no-sandbox')
     chrome_options.add_argument('--disable-dev-shm-usage')
 
@@ -23,37 +24,58 @@ def setup_chrome_driver():
 
 def scrape_publications():
     """scrapes the full HTML content with infinite scroll"""
+    journals = ['Nature', 'Science', 'Cell', 'Immunity', 'Circulation',
+                'Gastroenterology', 'Gut', 'Neuro-Oncology', 'Cancer Cell', 'Cell Metabolism',
+                'Nature Immunology', 'Nature Biotechnology', 'Nature Medicine', 'Nature Genetics', 'Nature Cell Biology',
+                'Nature Neuroscience', 'Nature Cancer', 'Nature Methods', 'Nature Metabolism',
+                'Nature Microbiology', 'Nature Nanotechnology', 'Science Immunology', 'Science Bulletin',
+                'Cancer Discovery', 'Cell Research', 'Bioactive Materials', 'Molecular Cancer',
+                'Molecular Neurodegeneration', 'Cell Stem Cell', 'Cell Host & Microbe',
+                'Nature Biomedical Engineering', 'Cellular & Molecular Immunology', 'The Lancet Microbe',
+                'The Lancet Oncology', 'Science Translational Medicine', 'Nucleic Acids Research',
+                'National Science Review', 'Journal of Hepatology', 'Military Medical Research',
+                'The Lancet Infectious Diseases', 'Signal Transduction and Targeted Therapy', 'Annals of Rheumatic Diseases',
+                'Journal of Hematology and Oncology']
 
-    base_url = "https://www.10xgenomics.com/publications?refinementList%5Bspecies%5D%5B0%5D=Human&page=1" #scrape publications related to Human species only
-
-    driver = setup_chrome_driver()
+    for i in range(0, len(journals), 3):
 
-    try:
-        driver.get(base_url)
+        driver = setup_chrome_driver()
+        base_url = f"https://www.10xgenomics.com/publications?page=1&sortBy=master%3Apublications&query="
 
-        last_height = driver.execute_script("return document.body.scrollHeight") # to get the initial height of page
+        chunk = journals[i:i+3]
+        file_name = ""
+        for count, j in enumerate(chunk):
+            file_name += "_" + j
+            base_url = base_url + f"&refinementList%5Bjournal%5D%5B{count}%5D={urllib.parse.quote(j)}"
 
-        while True:
+        print(base_url)
 
-            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # scroll down to the bottom
+        try:
+            driver.get(base_url)
 
-            time.sleep(1)
+            last_height = driver.execute_script("return document.body.scrollHeight") # to get the initial height of page
 
-            new_height = driver.execute_script("return document.body.scrollHeight") # height of new page with more content
+            while True:
 
-            publication_elements = driver.find_element(By.CLASS_NAME, "PublicationSearch") # find and extract HTML
-            html_content = publication_elements.get_attribute("outerHTML")
+                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # scroll down to the bottom
 
-            with open("publications_humans.html", "w", encoding="utf-8") as f:
-                f.write(html_content + "\n")
+                time.sleep(1.5)
 
-            if new_height == last_height: # no change in height, all content is loaded
-                print("Reached the end of the page.")
-                break
-            last_height = new_height
+                new_height = driver.execute_script("return document.body.scrollHeight") # height of new page with more content
 
-    finally:
-        driver.quit()
+                publication_elements = driver.find_element(By.CLASS_NAME, "PublicationSearch") # find and extract HTML
+                html_content = publication_elements.get_attribute("outerHTML")
+
+                with open(f"{file_name}.html", "a", encoding="utf-8") as f:
+                    f.write(html_content + "\n")
+
+                if new_height == last_height: # no change in height, all content is loaded
+                    print("Reached the end of the page.")
+                    break
+                last_height = new_height
+
+        finally:
+            driver.quit()
 
 def get_doi():
     """extracts DOI URLs from .html"""
@@ -75,4 +97,4 @@
 
 if __name__ == "__main__":
     scrape_publications()
-    get_doi()
+    # get_doi()
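
Note on the new URL construction: each batch of three journals is appended to the query string as indexed refinementList%5Bjournal%5D parameters, with urllib.parse.quote handling names that contain spaces or '&'. A minimal standalone sketch of that logic, runnable outside Selenium (not part of the patch; the journal list here is an abbreviated sample):

    import urllib.parse

    journals = ['Nature', 'Science Immunology', 'Cell Host & Microbe', 'Gut']  # abbreviated sample

    for i in range(0, len(journals), 3):
        chunk = journals[i:i+3]
        base_url = "https://www.10xgenomics.com/publications?page=1&sortBy=master%3Apublications&query="
        for count, j in enumerate(chunk):
            # quote() percent-encodes spaces and '&', so 'Cell Host & Microbe'
            # arrives as a single facet value instead of splitting the query string
            base_url += f"&refinementList%5Bjournal%5D%5B{count}%5D={urllib.parse.quote(j)}"
        print(base_url)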