scrap.py
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from tkinter import scrolledtext
from selenium import webdriver
import tkinter as tk
import time
import sys
import os

# File-like object that redirects writes (e.g. from print) into the Tk text widget.
class Redirect:
    def __init__(self, text_widget):
        self.output = text_widget

    def write(self, string):
        self.output.insert(tk.END, string)
        self.output.see(tk.END)  # Scroll to the bottom

    def flush(self):
        pass  # Needed for file-like compatibility

# Create the main window
root = tk.Tk()
root.title("Web Scraper")
root.geometry("300x500+0+0")

# URL input
url_label = tk.Label(root, text="Start URL:")
url_label.pack()
url_entry = tk.Entry(root, width=30)
url_entry.insert(0, "https://www.startpage.com/")
url_entry.pack(padx=10)

# Search term
search_label = tk.Label(root, text="Search term:")
search_label.pack()
search_entry = tk.Entry(root, width=30)
search_entry.insert(0, "hi")
search_entry.pack(padx=10)

# Number of pages
page_label = tk.Label(root, text="Number of pages:")
page_label.pack()
page_entry = tk.Spinbox(root, from_=1, to=100, width=5)  # Sets the range from 1 to 100
page_entry.pack(padx=10)

# URLs collected across the scraping run
collected_urls = []

# Scraping logic
def start_scraping():
    pages_to_scrape = int(page_entry.get())
    driver = webdriver.Firefox()
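    # Optional (assumption, not part of the original script): Firefox could also be
    # started headless so no browser window opens, e.g.:
    #   options = webdriver.FirefoxOptions()
    #   options.add_argument("-headless")
    #   driver = webdriver.Firefox(options=options)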
    driver.set_window_size(1000, 600)
    driver.set_window_position(301, 0)
    driver.get(url_entry.get())
    time.sleep(3)
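    # Alternative (not used in the original script): instead of fixed sleeps, Selenium's
    # explicit waits could block only until the search field is present, e.g.:
    #   from selenium.webdriver.support.ui import WebDriverWait
    #   from selenium.webdriver.support import expected_conditions as EC
    #   WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'q')))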
    # Enter the search term and submit the query
    search_field = driver.find_element(By.ID, 'q')
    search_field.send_keys(search_entry.get())
    search_field.submit()
    time.sleep(10)

    for page_number in range(pages_to_scrape):
        print(f"Scraping page {page_number + 1}")
        root.update()  # Refresh the GUI

        # Collect the result links on the current page
        elements = driver.find_elements(By.CSS_SELECTOR, 'a.w-gl__result-title.result-link')
        for element in elements:
            collected_urls.append(element.get_attribute('href'))
        # time.sleep(1)

        if page_number < pages_to_scrape - 1:
            try:
                time.sleep(1)
                driver.execute_script("window.scrollBy(0, 3500)")
                time.sleep(1)
                next_page = driver.find_element(By.CSS_SELECTOR, 'button.pagination__next-prev-button.next')
                next_page.click()
                print(f"Navigating to page {page_number + 2}.")
                root.update()  # Refresh the GUI
            except Exception as e:
                print(f"Error while switching to page {page_number + 2}: {e}")
                break

    time.sleep(3)
    driver.quit()
    print("Scraping finished.")
    root.update()  # Refresh the GUI

# Reset logic
def restart_app():
    root.destroy()  # Close the current window
    os.execl(sys.executable, sys.executable, *sys.argv)  # Restart the program

# Saving logic
def save_results():
    new_folder = '/home/mint/Schreibtisch/projekt/results'
    if not os.path.exists(new_folder):
        os.makedirs(new_folder)
    filename = f"{search_entry.get()}_{len(collected_urls)}.txt"
    file_path = os.path.join(new_folder, filename)
    with open(file_path, 'w') as file:
        for url in collected_urls:
            file.write(url + '\n')
    print(f"Results saved to: {filename}")

# Start button
start_button = tk.Button(root, text="Start Scraping", command=start_scraping)
start_button.pack(pady=3)

# Save button
speicher_button = tk.Button(root, text="Save Results", command=save_results)
speicher_button.pack(pady=2)

# Text area for the output
output_text = scrolledtext.ScrolledText(root, height=10)
output_text.pack(padx=10, pady=10)

# Redirect standard output and standard error into the text area
sys.stdout = Redirect(output_text)
sys.stderr = Redirect(output_text)

# Reset button
reset_button = tk.Button(root, text="Reset", command=restart_app)
reset_button.pack(pady=2)

# Start the main window loop
root.mainloop()