-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurl.py
36 lines (29 loc) · 972 Bytes
/
url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import urllib2
from BeautifulSoup import BeautifulSoup
import re
# Literal substitutions applied to scraped page titles: get_title() passes
# this mapping to replace_all(), which swaps every key for its value.
# NOTE(review): the key arrived as an empty/garbled literal in the copy under
# review (it looks like an HTML entity that was decoded in transit). '&#39;',
# the numeric entity for an apostrophe, is assumed from the '\'' value --
# confirm against the original file.
reps = {'&#39;': '\''}

# Finds an http/ftp/https URL inside free text.
#   group 1: scheme
#   group 2: host name (must contain at least one dot)
#   group 3: optional path/query/fragment; its last character may not be
#            '.', ',' or ':' so trailing sentence punctuation is left out.
# Raw string so the backslash escapes reach the regex engine untouched.
exp = re.compile(r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?')
def replace_all(text, dic):
    """Return ``text`` with every key of ``dic`` replaced by its value.

    Replacements are applied one after another in the mapping's iteration
    order, so a later replacement sees the output of the earlier ones.

    :param text: string to process.
    :param dic: mapping of substring -> replacement string.
    :returns: the resulting string; ``text`` unchanged when ``dic`` is empty.
    """
    # items() rather than iteritems(): same result, but also available on
    # Python 3 dictionaries.
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
def get_url(str):
    """Return the first URL found in ``str``, or ``None`` if there is none.

    The search uses the module-level ``exp`` pattern and returns the whole
    matched text.
    """
    found = exp.search(str)
    # group(0) is exactly the text between the match's start and end.
    return found.group(0) if found is not None else None
def get_title(str):
    """Fetch the page at URL ``str`` and return its <title> text.

    The title has runs of whitespace collapsed to single spaces and the
    substitutions in the module-level ``reps`` mapping applied.

    :param str: URL to fetch.
    :returns: the cleaned title, or ``None`` when the server answers with an
        HTTP error (its status code is printed), when the response is an
        image, or when the document has no usable <title> text.

    Only ``urllib2.HTTPError`` is handled here; other failures raised by
    ``urlopen`` propagate to the caller.
    """
    req = urllib2.Request(str, headers={'User-Agent' : "Pybot URL Title Grabber"})
    try:
        source = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print(e.code)
        return None

    try:
        # Decide on the Content-Type main type rather than substring-matching
        # 'image' against every raw header line, which also fired on
        # unrelated headers that merely contained that word.
        if source.info().getmaintype() == 'image':
            return None
        bs = BeautifulSoup(source)
    finally:
        # Release the connection whether or not parsing succeeded.
        source.close()

    # The document may have no <title>, and .string is None when the tag is
    # empty or holds more than a single text node.
    title_tag = bs.title
    if title_tag is None or title_tag.string is None:
        return None

    title = ' '.join(title_tag.string.split())
    return replace_all(title, reps)