spider.py
from urllib.request import urlopen
from urllib import robotparser
from link_finder import LinkFinder
from general import *
class Spider:
    # Class variables (shared among all instances)
    project_name = ''   # name of the project (and of the directory where all files are stored)
    base_url = ''       # the homepage url, needed so relative paths can be resolved
    domain_name = ''    # used to make sure we only connect to the intended domain
    # These are the text files that persist the crawl state
    queue_file = ''     # i.e. thenewboston/queue.txt
    crawled_file = ''   # i.e. thenewboston/crawled.txt
    # These are the in-memory variables - they live in RAM rather than on the HDD and are much faster to access
    queue = set()
    crawled = set()
    rp = robotparser.RobotFileParser()  # consulted before adding links, so we only queue what robots.txt allows

    # Constructor/initializer
    # domain_name lets us use the domain-checking helpers
    # to make sure we are connecting to a valid webpage without errors
    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + '/queue.txt'
        Spider.crawled_file = Spider.project_name + '/crawled.txt'
        self.boot()
        Spider.rp.set_url('http://' + Spider.domain_name + '/robots.txt')
        Spider.rp.read()
        self.crawl_page('First spider', Spider.base_url)
    @staticmethod  # marks this as a static method, so no self parameter is needed (and keeps PEP 8 happy)
    def boot():
        create_project_dir(Spider.project_name)                  # create the project directory
        create_data_files(Spider.project_name, Spider.base_url)  # create the .txt files
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)
    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.queue.remove(page_url)
            Spider.crawled.add(page_url)
            Spider.update_files()  # the fast in-memory work is done, so now update the files on the HDD
    @staticmethod
    def gather_links(page_url):
        # The response comes back as bytes, so it has to be decoded
        # into a string before it can be fed to the parser
        html_string = ''
        try:
            response = urlopen(page_url)
            content_type = response.getheader('Content-Type')
            if content_type and 'text/html' in content_type:  # the header may also carry a charset, e.g. 'text/html; charset=utf-8'
                html_bytes = response.read()
                html_string = html_bytes.decode('utf-8')
            findy = LinkFinder(Spider.base_url, page_url)
            findy.feed(html_string)
        except Exception as e:
            print('Error: cannot crawl page (' + str(e) + ')')
            return set()
        crawlable_links = set()
        for url in findy.page_links():
            if Spider.rp.can_fetch('*', url):  # only keep links that robots.txt allows us to fetch
                crawlable_links.add(url)
        return crawlable_links
    @staticmethod
    def add_links_to_queue(links):
        for url in links:
            if url in Spider.queue:
                continue  # skip urls that are already waiting in the queue
            if url in Spider.crawled:
                continue  # skip urls that have already been crawled
            if Spider.domain_name not in url:
                continue  # only crawl the site we specified, not the whole internet
            Spider.queue.add(url)
    @staticmethod
    def update_files():
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)
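spider.py imports create_project_dir, create_data_files, file_to_set and set_to_file from a general module that is not shown on this page. The original implementations are not included here, so the following is only a sketch of what those helpers would need to do for boot() and update_files() to work as used above:

# general.py - assumed helper functions (a sketch, not the original module)
import os

def create_project_dir(directory):
    # Create the project folder if it does not exist yet
    if not os.path.exists(directory):
        os.makedirs(directory)

def create_data_files(project_name, base_url):
    # Seed queue.txt with the homepage and create an empty crawled.txt
    queue = os.path.join(project_name, 'queue.txt')
    crawled = os.path.join(project_name, 'crawled.txt')
    if not os.path.isfile(queue):
        with open(queue, 'w') as f:
            f.write(base_url)
    if not os.path.isfile(crawled):
        open(crawled, 'w').close()

def file_to_set(file_name):
    # Read a file into a set, one URL per line
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results

def set_to_file(links, file_name):
    # Overwrite the file with the current contents of the set
    with open(file_name, 'w') as f:
        for link in sorted(links):
            f.write(link + '\n')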
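LinkFinder is imported from link_finder.py, which is also not part of this file. Based only on how it is used in gather_links() (constructed with the base url and page url, fed an HTML string, then asked for page_links()), a plausible sketch is an html.parser.HTMLParser subclass that collects absolute URLs from anchor tags; this is an assumption, not the original implementation:

# link_finder.py - assumed implementation, inferred from its usage in spider.py
from html.parser import HTMLParser
from urllib import parse

class LinkFinder(HTMLParser):

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # Collect the href of every anchor tag, converted to an absolute URL
        if tag == 'a':
            for (attribute, value) in attrs:
                if attribute == 'href' and value:
                    url = parse.urljoin(self.base_url, value)
                    self.links.add(url)

    def page_links(self):
        return self.links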
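For context, a minimal way to run this class might look like the snippet below. The project name and URLs are placeholders taken from the comments above, and it assumes link_finder.py and general.py sit alongside spider.py:

# run_spider.py - minimal usage sketch (names and URLs are placeholders)
from spider import Spider

PROJECT_NAME = 'thenewboston'            # directory where queue.txt / crawled.txt are stored
HOMEPAGE = 'http://thenewboston.com/'    # base_url, used to resolve relative links
DOMAIN_NAME = 'thenewboston.com'         # only links containing this domain are queued

# Creating the instance boots the project files, reads robots.txt,
# crawls the homepage and fills the queue with the discovered links.
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)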