# fetchpapers2.py
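# Walks the /peqp tree on the intranet server at 10.17.32.9 and saves the question
# papers it finds as PDFs in year/phase/department folders. Written for Python 2
# (urllib2, print statements).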
#will be using BeautifulSoup
import os
from bs4 import BeautifulSoup as soup
import urllib2
import re
proxy_support = urllib2.ProxyHandler({})
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
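# an empty ProxyHandler means no proxy is used at all, so the intranet host is
# reached directly even if a proxy is configured in the environment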
owd = os.getcwd() #saved the original working directory
os.chdir("/home/rahul/Rahul/X/webscraping/qPapers") #I'll download the papers here
homeUrl = "http://10.17.32.9/peqp"
homeData = urllib2.urlopen(homeUrl)
sip = soup(homeData)
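# the server seems to expose /peqp as a plain directory listing, so every year
# shows up as an <a> tag (with the first entry pointing back to the parent)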
#now fetching links for all the years
allYears = sip.find_all("a")
allYears = allYears[1:] #the first link is the link to the parent directory
#allYears has the <a> tags stored. We want only the links
#I love list comprehensions!!
allYears = [year.get("href") for year in allYears]
#now allYears has the partial URLs (hrefs); we'll construct the full URLs
allYears = ["http://10.17.32.9"+url for url in allYears]
#print allYears
#the loop for the years will begin now
#till 2008 year/department then we get pdf
#after this year/phase/department/courses
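# e.g. a post-2008 paper would sit at something like
#   /peqp/<year>/<phase>/<department>/<course>.pdf   (illustrative path, not an actual file)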
for year in allYears:
    # I need to get the directory name from the url, regexp to the rescue
    yearPattern = re.compile(r"/peqp/(\d{4})/")
    yearString = yearPattern.search(year).groups()[0]
    print yearString
    if int(yearString) <= 2007:
        pass  # old layout (year/department); not handled by this script
    else:
        if not os.path.exists(yearString):
            os.makedirs(yearString)
        os.chdir(yearString)
        yearData = urllib2.urlopen(year)
        sipYear = soup(yearData)
        allPhases = sipYear.find_all("a")[1:]
        allPhases = [phase.get("href") for phase in allPhases]
        allPhases = ["http://10.17.32.9"+url for url in allPhases]
        #print allPhases
        for phase in allPhases:
            phasePattern = re.compile(r"/peqp/" + yearString + "/" + r"([a-zA-Z0-9& _/,\(\)]*)/")
            phaseString = phasePattern.search(urllib2.unquote(phase)).groups()[0]
            #print phaseString
            if not os.path.exists(phaseString):
                os.makedirs(phaseString)
            os.chdir(phaseString)
            phaseData = urllib2.urlopen(phase)
            sipPhase = soup(phaseData)
            allDepartments = sipPhase.find_all("a")[1:]
            allDepartments = [department.get("href") for department in allDepartments]
            allDepartments = ["http://10.17.32.9"+url for url in allDepartments]
            for department in allDepartments:
                # re.escape so parentheses etc. in the phase name aren't read as regex syntax
                departmentPattern = re.compile(r"/peqp/" + yearString + "/" + re.escape(phaseString) + "/" + r"([a-zA-Z0-9& _,\(\)]*)")
                departmentString = departmentPattern.search(urllib2.unquote(department)).groups()[0]
                print department
                departmentData = urllib2.urlopen(department)
                if not os.path.exists(departmentString):
                    os.makedirs(departmentString)
                os.chdir(departmentString)
                sipDepartment = soup(departmentData)
                allCourses = sipDepartment.find_all("a")[1:]
                allCourses = [course.get("href") for course in allCourses]
                # an <a> tag without an href gives None; log those and keep only usable links
                count = 0
                validCourses = []
                for course in allCourses:
                    if course is None:
                        with open("unableToDownload.txt", 'a') as f:
                            f.write(departmentString + " " + str(count) + "\n")
                    else:
                        #print "http://10.17.32.9" + course
                        validCourses.append(course)
                    count += 1
                #print validCourses
                for course in validCourses:
                    try:
                        coursePattern = re.compile(r"/peqp/" + yearString + "/" + re.escape(phaseString) + "/" + re.escape(departmentString) + "/" + r"([a-zA-Z0-9& _,\(\)]*)")
                        courseString = coursePattern.search(urllib2.unquote(course)).groups()[0]
                    except AttributeError:
                        print "Oops! something to work upon"
                        continue  # no match; skip rather than reuse a stale courseString
                    #print courseString
                    try:
                        pdf = urllib2.urlopen("http://10.17.32.9" + course).read()
                        filename = courseString + ".pdf"
                        with open(filename, 'wb') as f:
                            f.write(pdf)
                    except UnicodeEncodeError:
                        with open("unableToDownload.txt", 'a') as f:
                            f.write("yet another\n")
                    except TypeError:
                        print "Oops!!"
                os.chdir('..')  # back to the phase directory
            os.chdir('..')      # back to the year directory
        os.chdir('..')          # back to the download root
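# owd was saved at the top but never used again; restoring it here (an assumed
# intent) leaves the script in the directory it was started from
os.chdir(owd)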