Skip to content

Commit

Permalink
https://github.com/BackofenLab/AxoWise/commit/3370bf163dce06294e64f9dba1b631f92ea616b0
Browse files Browse the repository at this point in the history
  • Loading branch information
mominaatifdar committed Nov 21, 2024
1 parent 3370bf1 commit 6143812
Showing 1 changed file with 44 additions and 22 deletions.
66 changes: 44 additions & 22 deletions scraping/scraping/getHtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
from bs4 import BeautifulSoup
import time
import re
import urllib.parse

def setup_chrome_driver():
"""set up Chrome driver"""
chrome_options = Options()
chrome_options.add_argument('--headless')
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

Expand All @@ -23,37 +24,58 @@ def setup_chrome_driver():

def scrape_publications():
    """Scrape the rendered HTML of 10x Genomics publication search results.

    Journals are processed in chunks of three. For each chunk a filtered
    search URL is built, the page is scrolled to the bottom repeatedly until
    no new content loads (infinite scroll), and the rendered
    ``PublicationSearch`` element's HTML is written to a file named after
    the journals in the chunk (``_<j1>_<j2>_<j3>.html``).
    """
    journals = ['Nature', 'Science','Cell', 'Immunity', 'Circulation',
                'Gastroenterology','Gut', 'Neuro-Oncology','Cancer Cell', 'Cell Metabolism',
                'Nature Immunology', 'Nature Biotechnology', 'Nature Medicine', 'Nature Genetics', 'Nature Cell Biology',
                'Nature Neuroscience', 'Nature Cancer', 'Nature Methods', 'Nature Metabolism',
                'Nature Microbiology', 'Nature Nanotechnology', 'Science Immunology', 'Science Bulletin',
                'Cancer Discovery', 'Cell Research', 'Bioactive Materials', 'Molecular Cancer',
                'Molecular Neurodegeneration','Cell Stem Cell','Cell Host & Microbe', 'Nature Cell Biology',
                'Nature Biomedical Engineering','Cellular & Molecular Immunology','The Lancet Microbe',
                'The Lancet Oncology','Science Translational Medicine','Nucleic Acids Research',
                'National Science Review','Journal of Hepatology','Military Medical Research',
                'The Lancet Infectious Diseases','Signal Transduction and Targeted Therapy','Annals of Rheumatic Diseases',
                'Journal of Hematology and Oncology']
    # NOTE(review): 'Nature Cell Biology' appears twice in the list — confirm
    # whether the duplicate (and the resulting duplicate query) is intended.

    for i in range(0, len(journals), 3):
        chunk = journals[i:i + 3]

        # Build one search URL filtered to this chunk of journals, and a
        # matching output file name from the journal names.
        base_url = "https://www.10xgenomics.com/publications?page=1&sortBy=master%3Apublications&query="
        file_name = ""
        for count, j in enumerate(chunk):
            file_name += "_" + j
            base_url += f"&refinementList%5Bjournal%5D%5B{count}%5D={urllib.parse.quote(j)}"

        print(base_url)

        driver = setup_chrome_driver()
        try:
            driver.get(base_url)

            # Scroll until the page height stops growing — that means the
            # infinite-scroll list has finished loading.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1.5)  # give lazy-loaded results time to render
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:  # no change in height, all content is loaded
                    print("Reached the end of the page.")
                    break
                last_height = new_height

            # Extract ONCE, after all content is loaded. Writing inside the
            # scroll loop in append mode would accumulate duplicate partial
            # snapshots of the list in the output file.
            publication_elements = driver.find_element(By.CLASS_NAME, "PublicationSearch")
            html_content = publication_elements.get_attribute("outerHTML")
            with open(f"{file_name}.html", "w", encoding="utf-8") as f:
                f.write(html_content + "\n")
        finally:
            # Always release the browser, even if scraping a chunk fails.
            driver.quit()

def get_doi():
"""extracts DOI URLs from .html"""
Expand All @@ -75,4 +97,4 @@ def get_doi():

if __name__ == "__main__":
    # Entry point: scrape the per-journal publication HTML dumps.
    scrape_publications()
    # get_doi()  # DOI extraction step disabled in this commit; re-enable to
    #            # parse DOI URLs out of the saved .html files.

0 comments on commit 6143812

Please sign in to comment.