scraper.py
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def setup_driver():
    """Set up Chrome in headless mode."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    return webdriver.Chrome(options=chrome_options)
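
# Note: with Selenium 4.6+, Selenium Manager resolves a matching chromedriver
# automatically; on older versions, chromedriver must be on PATH or passed in
# explicitly via a Service object.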


def get_links(url):
    """Fetch a webpage and extract all links."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href and not href.startswith(('javascript:', '#', 'mailto:')):
                if not href.startswith(('http://', 'https://')):
                    if href.startswith('/'):
                        # Root-relative link: prepend scheme and host
                        href = '/'.join(url.split('/')[:3]) + href
                    else:
                        # Page-relative link: join onto the base URL
                        href = url.rstrip('/') + '/' + href
                links.append(href)
        print("\nFound links:")
        for i, link in enumerate(links, 1):
            print(f"{i}. {link}")
        return links
    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return []
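
# Note: the manual URL joining above is a minimal sketch. The standard
# library's urllib.parse.urljoin handles '../' segments, query strings, and
# scheme-relative links more robustly, e.g.:
#
#     from urllib.parse import urljoin
#     href = urljoin(url, link['href'])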


def parse_selection(selection, max_index):
    """Parse a user selection string into a sorted list of indices."""
    indices = set()
    parts = selection.replace(' ', '').split(',')
    for part in parts:
        if '-' in part:
            try:
                start, end = map(int, part.split('-'))
                indices.update(range(start, end + 1))
            except ValueError:
                print(f"Invalid range: {part}")
        else:
            try:
                indices.add(int(part))
            except ValueError:
                print(f"Invalid number: {part}")
    # Keep only indices that point at an actual link
    valid_indices = {i for i in indices if 1 <= i <= max_index}
    return sorted(valid_indices)
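
# Example: parse_selection("1-3, 5, 99", max_index=10) returns [1, 2, 3, 5];
# 99 falls outside 1..10 and is dropped by the validity filter.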


def download_pages(links, selected_indices):
    """Download content from the selected links and save it to a file."""
    driver = setup_driver()
    try:
        with open('downloaded_content.txt', 'w', encoding='utf-8') as f:
            for index in selected_indices:
                url = links[index - 1]
                print(f"\nDownloading: {url}")
                try:
                    driver.get(url)
                    # Wait for content to load
                    time.sleep(5)  # Give JavaScript time to run
                    # Write page header
                    f.write(f"\n{'='*50}\n{url}\n{'='*50}\n\n")
                    try:
                        # Wait for a common content container to be present
                        main_content = WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located(
                                (By.CSS_SELECTOR, "main, article, .content, .post-content")
                            )
                        )
                        content = main_content.text
                    except TimeoutException:
                        # No recognised content container; fall back to the full body text
                        content = driver.find_element(By.TAG_NAME, "body").text
                    f.write(content + '\n\n')
                except Exception as e:
                    f.write(f"Error downloading {url}: {e}\n\n")
                    print(f"Error downloading {url}: {e}")
                # Be nice to the server
                time.sleep(2)
    finally:
        # Always release the browser, even if a download fails hard
        driver.quit()
    print("\nAll content has been saved to 'downloaded_content.txt'")


def main():
    url = input("Enter the webpage URL: ")
    links = get_links(url)
    if links:
        print("\nEnter the numbers of links to download (e.g., '1-6, 12, 15, 22-36')")
        selection = input("Selection: ")
        selected_indices = parse_selection(selection, len(links))
        if selected_indices:
            print(f"\nWill download {len(selected_indices)} pages...")
            download_pages(links, selected_indices)
        else:
            print("No valid indices selected.")


if __name__ == "__main__":
    main()