mycrawler.py
# -*- coding: utf-8 -*-
import urllib2, urlparse
import os, sys
from HTMLParser import HTMLParser
import threading, Queue
# Python 2 hack: make UTF-8 the default encoding so page content can be
# written to files without explicit decode()/encode() calls.
reload(sys)
sys.setdefaultencoding('utf-8')

# Extract the registered domain from a URL (e.g. 'ku.edu'); links whose
# domain does not match the seed's are dismissed by the crawler.
# Referenced: https://www.youtube.com/watch?v=nRW90GASSXE&list=PL6gx4Cwl9DGA8Vys-f48mAH9OKSUyav0q
def get_net_location(url):
    net_loc = urlparse.urlparse(url).netloc
    url_split = net_loc.split('.')
    # keep only the last two labels: 'www.ku.edu' -> 'ku.edu'
    domain_name = url_split[-2] + '.' + url_split[-1]
    return domain_name
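
# Example: get_net_location('http://www2.ku.edu/~dept/page.html')
# returns 'ku.edu' (netloc 'www2.ku.edu' -> last two labels). Note this
# naive split would reduce country-code hosts like example.co.uk to
# 'co.uk'; it is good enough for the ku.edu crawl below.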

class MyHTMLParser(HTMLParser):
    def __init__(self, hmpage, domain):
        HTMLParser.__init__(self)
        self.hmpage = hmpage
        self.domain = domain
        self.page_links = set()

    def handle_starttag(self, tag, attrs):
        # collect every href in an <a> tag, resolved against the home page
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    link = urlparse.urljoin(self.hmpage, value)
                    self.page_links.add(link)
    def get_page_html(self, url):
        global queue
        global ToCrawl
        global HasCrawled
        try:
            pageReq = urllib2.Request(url)
            response = urllib2.urlopen(pageReq)  # open once and reuse
            content_type = response.info().getheader('Content-Type')
            if content_type and 'text/html' in content_type:
                # after sys.setdefaultencoding('utf-8') there is no need
                # for read().decode('utf-8')
                pageHtml = response.read()
                self.feed(pageHtml)
                for pglink in self.page_links:
                    if (pglink in ToCrawl) or (pglink in HasCrawled):
                        continue
                    try:
                        # stay inside the seed's domain
                        if self.domain != get_net_location(pglink):
                            continue
                    except Exception:
                        print "Can't find a domain name in the current URL!"
                        continue
                    # skip mail/phone links and PDF documents
                    if 'mailto' in pglink or 'tel:' in pglink or '.pdf' in pglink:
                        continue
                    queue.put(pglink)
                    ToCrawl.add(pglink)
                return pageHtml
            else:
                print "Not a text/html URL."
                return " "
        except Exception as e:
            print str(e)
            return None

    def handle_endtag(self, tag):
        pass

# The spider worker: pull links from the queue, download each HTML page,
# and record (page number, URL) in DownloadPgList.
def MySpider(url):
    global queue
    global ToCrawl
    global HasCrawled
    global HasDownloadpg   # set()
    global DownloadPgList  # list
    global pgCount
    NewLink = url
    while len(ToCrawl) > 0:
        if NewLink in HasCrawled or NewLink in HasDownloadpg:
            NewLink = queue.get()
            continue
        HasCrawled.add(NewLink)
        ToCrawl.discard(NewLink)  # this link is no longer pending
        print "HasCrawled:", len(HasCrawled), " ToCrawl:", len(ToCrawl)
        print "now crawling:", NewLink
        try:
            pgparser = MyHTMLParser(homepage, domain)
            pghtml = pgparser.get_page_html(NewLink)
            try:
                HasDownloadpg.add(NewLink)
                pgCount = len(HasDownloadpg)
                DownloadPgList.append([pgCount, NewLink])
                print "Downloading...", NewLink
                f = open('./ku_crawled_files/%d.txt' % pgCount, 'w')
                f.write(pghtml)
                f.close()
                # append only the new (page number, URL) pair; rewriting the
                # whole list on every pass duplicated earlier entries
                fpg = open('DownloadPgList.txt', 'a')
                fpg.write("%s %s \n" % (pgCount, NewLink))
                fpg.close()
                print "Successfully downloaded", NewLink, "download page number:", len(HasDownloadpg)
            except:
                print "Failed to download", NewLink
                continue
            NewLink = queue.get()
        except Exception:
            NewLink = queue.get()
            continue

# The main function to run my web crawler
if __name__ == "__main__":
    # lock = threading.Lock()
    homepage = 'http://www.ku.edu'
    seed = 'http://www.ku.edu'
    domain = get_net_location(homepage)
    queue = Queue.Queue()
    ToCrawl = set()
    HasCrawled = set()
    HasDownloadpg = set()
    DownloadPgList = []
    pgCount = 0
    ThreadNum = 2
    ThreadList = []
    # make sure the output directory exists before any worker writes to it
    if not os.path.exists('./ku_crawled_files'):
        os.makedirs('./ku_crawled_files')
    # crawl the seed first so the queue holds links for the worker threads
    HasCrawled.add(seed)
    seedparser = MyHTMLParser(homepage, domain)
    seedpghtml = seedparser.get_page_html(seed)
    for i in range(0, ThreadNum):
        th = threading.Thread(target=MySpider, args=(queue.get(),))
        ThreadList.append(th)
    for t in ThreadList:
        t.start()
    print "\n Thread Count %s" % str(threading.activeCount())
    print threading.enumerate()
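
# A minimal sketch of how this script is run (assuming Python 2 and
# network access to www.ku.edu):
#   $ python mycrawler.py
# Crawled pages are saved as ./ku_crawled_files/<N>.txt and the
# (number, URL) index is appended to DownloadPgList.txt.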