-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
66 lines (57 loc) · 1.89 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
"""
Search module.
"""
from urllib.parse import urljoin

import requests as Requests
from bs4 import BeautifulSoup
class Search:
    """
    Fetch a web page and collect the links it contains.
    """

    # href prefixes that never point at a fetchable page: phone/mail/script
    # pseudo-URLs, in-page fragments, bare query strings and values that
    # begin with a space.
    _SKIPPED_PREFIXES = ("tel:", "javascript:", "mailto:", "#", "?", " ")

    def __init__(self, webpage):
        """
        Store the URL of the page to search.

        webpage: URL string; it is used both as the address to fetch and as
        the prefix for site-relative links.
        """
        self.webpage = webpage

    def get_links(self, timeout=10):
        """
        Return the links found in the <a href="..."> tags of self.webpage.

        timeout: seconds passed to the HTTP request so an unresponsive
        server cannot block the caller forever.

        Returns a list of URL strings in document order. If the request
        fails, the exception is printed and an empty list is returned.
        """
        try:
            response = Requests.get(self.webpage, timeout=timeout)
        except Requests.exceptions.RequestException as _e:
            print(_e)
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        all_links_found = []
        for link in soup.find_all("a"):
            if not link.has_attr("href"):
                continue
            resolved = self._resolve_href(str(link.get("href")))
            if resolved is not None:
                all_links_found.append(resolved)
        return all_links_found

    def _resolve_href(self, href):
        """
        Turn one raw href value into an absolute link, or None to skip it.

        The original spelling of href is kept in the result because URL
        paths are case-sensitive; a lowercased copy is used only to
        classify the value.
        """
        probe = href.lower()
        # Drop a trailing slash so joining never produces "site//path".
        base = self.webpage.rstrip("/")

        # Empty values, a link back to the site root, and non-page schemes
        # are ignored.
        if probe in ("", "/") or probe.startswith(self._SKIPPED_PREFIXES):
            return None
        if probe.startswith("//"):
            # Protocol-relative link: it names another host and inherits
            # only the scheme of the current page.
            return urljoin(self.webpage, href)
        if probe.startswith("/"):
            return base + href
        if probe.startswith(("http://", "https://")):
            return href
        # Relative links are accepted only when they look like pages:
        # an .html/.htm file or a name without any extension.
        if probe.endswith((".html", ".htm")) or "." not in probe:
            return base + "/" + href
        # Anything else is reported so unhandled link shapes are visible.
        print(href)
        return None