Skip to content

Commit

Permalink
https://github.com/BackofenLab/AxoWise/commit/3370bf163dce06294e64f9dba1b631f92ea616b0
Browse files Browse the repository at this point in the history
  • Loading branch information
mominaatifdar committed Nov 21, 2024
1 parent 3370bf1 commit 6143812
Showing 1 changed file with 44 additions and 22 deletions.
66 changes: 44 additions & 22 deletions scraping/scraping/getHtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
from bs4 import BeautifulSoup
import time
import re
import urllib.parse

def setup_chrome_driver():
"""set up Chrome driver"""
chrome_options = Options()
chrome_options.add_argument('--headless')
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

Expand All @@ -23,37 +24,58 @@ def setup_chrome_driver():

def scrape_publications():
    """Scrape the rendered HTML of 10x Genomics publication search results.

    Journals are processed in chunks of three. For each chunk a filtered
    search URL is built, the page is scrolled to the bottom repeatedly until
    no new content loads (infinite scroll), and the rendered
    ``PublicationSearch`` element's HTML is written to a file named after
    the journals in the chunk (``_<j1>_<j2>_<j3>.html``).
    """
    journals = ['Nature', 'Science','Cell', 'Immunity', 'Circulation',
                'Gastroenterology','Gut', 'Neuro-Oncology','Cancer Cell', 'Cell Metabolism',
                'Nature Immunology', 'Nature Biotechnology', 'Nature Medicine', 'Nature Genetics', 'Nature Cell Biology',
                'Nature Neuroscience', 'Nature Cancer', 'Nature Methods', 'Nature Metabolism',
                'Nature Microbiology', 'Nature Nanotechnology', 'Science Immunology', 'Science Bulletin',
                'Cancer Discovery', 'Cell Research', 'Bioactive Materials', 'Molecular Cancer',
                'Molecular Neurodegeneration','Cell Stem Cell','Cell Host & Microbe', 'Nature Cell Biology',
                'Nature Biomedical Engineering','Cellular & Molecular Immunology','The Lancet Microbe',
                'The Lancet Oncology','Science Translational Medicine','Nucleic Acids Research',
                'National Science Review','Journal of Hepatology','Military Medical Research',
                'The Lancet Infectious Diseases','Signal Transduction and Targeted Therapy','Annals of Rheumatic Diseases',
                'Journal of Hematology and Oncology']
    # NOTE(review): 'Nature Cell Biology' appears twice in the list — confirm
    # whether the duplicate (and the resulting duplicate query) is intended.

    for i in range(0, len(journals), 3):
        chunk = journals[i:i + 3]

        # Build one search URL filtered to this chunk of journals, and a
        # matching output file name from the journal names.
        base_url = "https://www.10xgenomics.com/publications?page=1&sortBy=master%3Apublications&query="
        file_name = ""
        for count, j in enumerate(chunk):
            file_name += "_" + j
            base_url += f"&refinementList%5Bjournal%5D%5B{count}%5D={urllib.parse.quote(j)}"

        print(base_url)

        driver = setup_chrome_driver()
        try:
            driver.get(base_url)

            # Scroll until the page height stops growing — that means the
            # infinite-scroll list has finished loading.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1.5)  # give lazy-loaded results time to render
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:  # no change in height, all content is loaded
                    print("Reached the end of the page.")
                    break
                last_height = new_height

            # Extract ONCE, after all content is loaded. Writing inside the
            # scroll loop in append mode would accumulate duplicate partial
            # snapshots of the list in the output file.
            publication_elements = driver.find_element(By.CLASS_NAME, "PublicationSearch")
            html_content = publication_elements.get_attribute("outerHTML")
            with open(f"{file_name}.html", "w", encoding="utf-8") as f:
                f.write(html_content + "\n")
        finally:
            # Always release the browser, even if scraping a chunk fails.
            driver.quit()

def get_doi():
"""extracts DOI URLs from .html"""
Expand All @@ -75,4 +97,4 @@ def get_doi():

if __name__ == "__main__":
    # Entry point: scrape the per-journal publication HTML dumps.
    scrape_publications()
    # get_doi()  # DOI extraction step disabled in this commit; re-enable to
    #            # parse DOI URLs out of the saved .html files.

0 comments on commit 6143812

Please sign in to comment.